#!/usr/bin/perl -w use strict; use Compress::Zlib (); # Copyright (C) 2004 Nicholas Clark # Distribute with same terms as perl5 - GPL or Artistic licence. # Decompression code written while reading Compress::LZW by # Sean O'Rourke & Matt Howard, so strongly influenced by it. # Using chunks of 4 and 1 bits is not as efficient as 8 bits. # TODO Wacky cominations such as 7, 9 and 10 bits, might turn out to be more # effective than bytes my $bits = 8; use constant save=>1; # 0 or 1 sub init_output { [""]; } # Find the number of bits needed for this number sub bits_needed { (unpack "b32", pack "V", $_[0]) =~ /(.*1)/ ? length $1 : die $_[0]; } open C, ">C"; open D, ">D"; sub output { my ($state, $code, $max_code) = @_; die "Code empty" unless length $code; die "Code $code too big" if $code > 65535; my $b = bits_needed($max_code); my $crossover = 1 << ($b - 1); if (save && $code < $crossover && $code > ($max_code - $crossover)) { print C "$code $b $crossover $max_code save\n"; --$b; } else { print C "$code $b $crossover $max_code\n"; } my $new = unpack "b$b", pack "V", $code; $state->[1] .= $new; my $l = length $state->[1]; if ($l > 8) { # Rip multiples of 8 bits off the front $state->[0] .= pack "b*", substr $state->[1], 0, $l & ~7, ""; } } sub final { my $state = shift; return $state->[0] .= pack "b*", $state->[1]; } sub init_input { ["", $_[0]]; } sub getbits { my ($state, $want) = @_; my $l = length $state->[0]; if ($l < $want) { return if !length $state->[1]; # EOF my $add = 1 + (($want - $l) >> 3); # warn "Getting $add $want $l"; # Put some more bits on the bit accumulator $state->[0] .= unpack "b*", substr $state->[1], 0, $add, ""; } substr $state->[0], 0, $want, ""; } sub input { my ($state, $max_code) = @_; my $b = bits_needed($max_code) - save; my $bits = getbits ($state, $b); return unless defined $bits; # warn "$state->[0] ", length $state->[0]; my $code = unpack "V", pack "b32", $bits; # warn "$code '$state->[0]'"; my $crossover = 1 << ($b - 1 + save); if (save && $code > ($max_code - $crossover)) { $b += save; print D "$code $b $crossover $max_code save\n"; } else { $code += $crossover if getbits ($state, 1); $b += save; print D "$code $b $crossover $max_code\n"; } $code; } sub compress { my $input; my $output = init_output; my $next_code = 1<<$bits; # TODO - map these to a frequency table with least likely bytes (or nibbles or # bits) first, based on "average" files (eg generate table by reading every # file on the filesystem of a typical machine) my %codes; my $multiplier = 1; if ($bits == 8) { $input = shift; $codes{chr ($next_code - $_)} = $_ - 1 foreach 1..$next_code; } elsif ($bits == 4) { $input = unpack "h*", shift; $codes{sprintf "%x", ($next_code - $_)} = $_ - 1 foreach 1..$next_code; } elsif ($bits == 1) { $input = unpack "b*", shift; @codes{1, 0} = (0, 1); } elsif ($bits == 12) { $input = shift; my $over = length ($input) % 3; $input .= "\0" x (3 - $over) if $over; $_ = unpack "h*", $input; s/(...)/${1}0/g; $input = pack "h*", $_; $multiplier = 2; foreach (1..$next_code) { my $code = $next_code - $_; # This is all little endian. $codes{chr ($code & 255). chr ($code >> 8)} = $_ - 1; } } else { die "Can't do bits $bits"; } my $accumulator; my $pos = (length $input) / -$multiplier; while ($pos) { $accumulator .= substr($input, $multiplier * $pos++, $multiplier); if (exists $codes{$accumulator}) { # good next; } # Oops. One too many. $codes{$accumulator} = $next_code; # Back up --$pos; substr ($accumulator, -$multiplier) = ""; output($output, $codes{$accumulator}, $next_code); $next_code++; # Yes, we go round the first bit of the loop again once too many. $accumulator = ''; } output($output, $codes{$accumulator}, $next_code); return final($output); } sub uncompress { my $input = init_input @_; my $output; my $multiplier = 1; my %codes; my $next_code = 1<<$bits; # TODO - map these to a frequency table with least likely bytes (or nibbles or # bits) first. if ($bits == 8) { $codes{$_ - 1} = chr ($next_code - $_) foreach 1..$next_code; } elsif ($bits == 4) { $codes{$_ - 1} = sprintf "%x", ($next_code - $_) foreach 1..$next_code; } elsif ($bits == 1) { @codes{1, 0} = (0, 1); } elsif ($bits == 12) { $multiplier = 2; foreach (1..$next_code) { my $code = $next_code - $_; # This is all little endian. $codes{$_ - 1} = chr ($code & 255). chr ($code >> 8); } } else { die "Can't do bits $bits"; } my $prev = $output = $codes{input ($input, $next_code)}; while (1) { # Not quire sure why we need the +1 my $code = input ($input, 1 + $next_code); last unless defined $code; if (exists $codes{$code}) { my $this = $codes{$code}; $codes{$next_code++} = $prev . substr ($this, 0, $multiplier); $prev = $this; } else { die "$next_code $code" unless $next_code == $code; $prev .= substr ($prev, 0, $multiplier); $codes{$next_code++} = $prev; } $output .= $prev; } if ($bits == 8) { return $output; } elsif ($bits == 4) { pack "h*", $output; } elsif ($bits == 1) { pack "b*", $output; } elsif ($bits == 12) { $_ = unpack "h*", $output; s/(...)./${1}/g; $output = pack "h*", $_; $output; } } sub test { my ($name, $input) = @_; my $compress = compress $input; my $def = Compress::Zlib::compress($input); printf "We manage %d, Zlib managed %d\n", length $compress, length $def; #print "$compress\n"; my $output = uncompress $compress; printf "Output Was %d now %d\n", length $input, length $output; # FIXME. Need to encode the padding in the output stream somehow. # Can possibly do by tracking how many trailing bits we have short of a # whole new quantum of bits, and using this as the byte chop count. # If this doesn't work for all cases, will need to round up to # LCM (8, quantum) and then add 1 more quantum which hold the chop. substr ($output, length $input) = ""; print $output eq $input ? "Pass\n" : "Fail\n"; } # Test code - compress ourselves. seek DATA, 0, 0 or die $!; undef $/; test ("self", ); open FH, $^X or die; test ($^X,); __END__