#!/usr/bin/perl -w
use strict;
use Compress::Zlib ();

# Copyright (C) 2004 Nicholas Clark
# Distribute with same terms as perl5 - GPL or Artistic licence.
# Decompression code written while reading Compress::LZW by
# Sean O'Rourke & Matt Howard, so strongly influenced by it.

# Using chunks of 4 and 1 bits is not as efficient as 8 bits.
# TODO Wacky cominations such as 7, 9 and 10 bits, might turn out to be more
# effective than bytes
my $bits = 8;

use constant save=>1; # 0 or 1
sub init_output {
  [""];
}

# Find the number of bits needed for this number
sub bits_needed {
  (unpack "b32", pack "V", $_[0]) =~ /(.*1)/ ? length $1 : die $_[0];
}

open C, ">C";
open D, ">D";
sub output {
  my ($state, $code, $max_code) = @_;
  die "Code empty" unless length $code;
  die "Code $code too big" if $code > 65535;
  my $b = bits_needed($max_code);
  my $crossover = 1 << ($b - 1);
  if (save && $code < $crossover && $code > ($max_code - $crossover)) {
    print C "$code $b $crossover $max_code save\n";
    --$b;
  } else {
    print C "$code $b $crossover $max_code\n";
  }

  my $new = unpack "b$b", pack "V", $code;
  $state->[1] .= $new;
  my $l = length $state->[1];
  if ($l > 8) {
    # Rip multiples of 8 bits off the front
    $state->[0] .= pack "b*", substr $state->[1], 0, $l & ~7, "";
  }
}
sub final {
  my $state = shift;
  return $state->[0] .= pack "b*", $state->[1];
}

sub init_input {
  ["", $_[0]];
}

sub getbits {
  my ($state, $want) = @_;
  my $l = length $state->[0];
  if ($l < $want) {
    return if !length $state->[1]; # EOF
    my $add = 1 + (($want - $l) >> 3);
    # warn "Getting $add $want $l";
    # Put some more bits on the bit accumulator
    $state->[0] .= unpack "b*", substr $state->[1], 0, $add, "";
  }
  substr $state->[0], 0, $want, "";
}
sub input {
  my ($state, $max_code) = @_;
  my $b = bits_needed($max_code) - save;
  my $bits = getbits ($state, $b);
  return unless defined $bits;
  # warn "$state->[0] ", length $state->[0];
  my $code = unpack "V", pack "b32", $bits;
  # warn "$code '$state->[0]'";

  my $crossover = 1 << ($b - 1 + save);

  if (save && $code > ($max_code - $crossover)) {
    $b += save; print D "$code $b $crossover $max_code save\n";
  } else {
    $code += $crossover if getbits ($state, 1);
    $b += save; print D "$code $b $crossover $max_code\n";
  }
  $code;
}

sub compress {
  my $input;
  my $output = init_output;

  my $next_code = 1<<$bits;
  # TODO - map these to a frequency table with least likely bytes (or nibbles or
  # bits) first, based on "average" files (eg generate table by reading every
  # file on the filesystem of a typical machine)
  my %codes;
  my $multiplier = 1;
  if ($bits == 8) {
    $input = shift;
    $codes{chr ($next_code - $_)} = $_ - 1 foreach 1..$next_code;
  } elsif ($bits == 4) {
    $input = unpack "h*", shift;
    $codes{sprintf "%x", ($next_code - $_)} = $_ - 1 foreach 1..$next_code;
  } elsif ($bits == 1) {
    $input = unpack "b*", shift;
    @codes{1, 0} = (0, 1);
  } elsif ($bits == 12) {
    $input = shift;
    my $over = length ($input) % 3;
    $input .= "\0" x (3 - $over) if $over;
    $_ = unpack "h*", $input;
    s/(...)/${1}0/g;
    $input = pack "h*", $_;
    $multiplier = 2;
    foreach (1..$next_code) {
      my $code = $next_code - $_;
      # This is all little endian.
      $codes{chr ($code & 255). chr ($code >> 8)} = $_ - 1;
    }
  } else {
    die "Can't do bits $bits";
  }

  my $accumulator;
  my $pos = (length $input) / -$multiplier;
  while ($pos) {
    $accumulator .= substr($input, $multiplier * $pos++, $multiplier);
    if (exists $codes{$accumulator}) {
      # good
      next;
    }
    # Oops. One too many.
    $codes{$accumulator} = $next_code;
    # Back up
    --$pos; substr ($accumulator, -$multiplier) = "";
    output($output, $codes{$accumulator}, $next_code);
    $next_code++;
    # Yes, we go round the first bit of the loop again once too many.
    $accumulator = '';
  }
  output($output, $codes{$accumulator}, $next_code);
  return final($output);
}

sub uncompress {
  my $input = init_input @_;
  my $output;

  my $multiplier = 1;
  my %codes;
  my $next_code = 1<<$bits;
  # TODO - map these to a frequency table with least likely bytes (or nibbles or
  # bits) first.
  if ($bits == 8) {
    $codes{$_ - 1} = chr ($next_code - $_) foreach 1..$next_code;
  } elsif ($bits == 4) {
    $codes{$_ - 1} = sprintf "%x", ($next_code - $_) foreach 1..$next_code;
  } elsif ($bits == 1) {
    @codes{1, 0} = (0, 1);
  } elsif ($bits == 12) {
    $multiplier = 2;
    foreach (1..$next_code) {
      my $code = $next_code - $_;
      # This is all little endian.
      $codes{$_ - 1} = chr ($code & 255). chr ($code >> 8);
    }
  } else {
    die "Can't do bits $bits";
  }

  my $prev = $output = $codes{input ($input, $next_code)};
  while (1) {
    # Not quire sure why we need the +1
    my $code = input ($input, 1 + $next_code);
    last unless defined $code;
    if (exists $codes{$code}) {
      my $this = $codes{$code};
      $codes{$next_code++} = $prev . substr ($this, 0, $multiplier);
      $prev = $this;
    } else {
      die "$next_code $code" unless $next_code == $code;
      $prev .= substr ($prev, 0, $multiplier);
      $codes{$next_code++} = $prev;
    }
    $output .= $prev;
  }
  if ($bits == 8) {
    return $output;
  } elsif ($bits == 4) {
    pack "h*", $output;
  } elsif ($bits == 1) {
    pack "b*", $output;
  } elsif ($bits == 12) {
    $_ = unpack "h*", $output;
    s/(...)./${1}/g;
    $output = pack "h*", $_;
  $output;
  }
}

sub test {
  my ($name, $input) = @_;
  my $compress = compress $input;
  my $def = Compress::Zlib::compress($input);
  printf "We manage %d, Zlib managed %d\n", length $compress, length $def;
  #print "$compress\n";
  my $output = uncompress $compress;
  printf "Output Was %d now %d\n", length $input, length $output;
  # FIXME. Need to encode the padding in the output stream somehow.
  # Can possibly do by tracking how many trailing bits we have short of a
  # whole new quantum of bits, and using this as the byte chop count.
  # If this doesn't work for all cases, will need to round up to
  # LCM (8, quantum) and then add 1 more quantum which hold the chop.
  substr ($output, length $input) = "";
  print $output eq $input ? "Pass\n" : "Fail\n";
}

# Test code - compress ourselves.
seek DATA, 0, 0 or die $!;
undef $/;
test ("self", <DATA>);
open FH, $^X or die;
test ($^X,<FH>);

__END__