Close

Source Code

A project log for Lossy Text Compression

"And God saw that it was ace."

greg-kennedyGreg Kennedy 05/08/2015 at 16:460 Comments

No idea where to put source code on this site, so I'll just make a project log of it.

#!/usr/bin/perl
use strict;
use warnings;

### LOSSY TEXT COMPESSION
# Conventional compression utilities rely on a "dictionary" to expand
#   indices into complete tokens.
# This compressor instead uses a thesaurus to minimize output file size.
my %thes;

{
  # Open moby thesaurus.
  # You may substitute your own thesaurus file.  The format is
  #  word,synonym1,synonym2,...\r
  open (T, '<', 'mthes/mobythes.aur') or die "Couldn't open thesaurus: $!\n";

  # Moby thesaurus is CR-terminated
  local $/ = "\r";

  # Read line at a time
  while (my $line = <T>)
  {
    chomp $line;
    # Pull keyword and then all synonyms.
    if ($line =~ m/^([^,]+),(.+)$/)
    {
      my @syns = split(/,/, $2);
      # identify shortest syn
      my $best_syn = $1;
      foreach my $syn (@syns)
      {
        # Straight abbreviations aren't very funny
        next if ($syn eq uc($syn));
        # Two letter synonyms aren't readable
        next if (length($syn) < 3);
        # Optionally, filter stupidly overcommon synonyms
        #next if ($syn eq 'air' || $syn eq 'bed');
        if (length($syn) < length($best_syn)) { $best_syn = $syn; }
      }

      # Save RAM: store syn only if it's better than the keyword
      if ($best_syn ne $1)
      {
        $thes{$1} = $best_syn;
      }
    } else {
      # Your thesaurus is broken.
      die "%% $line\n";
    }
  }
  close (T);
}

# Some functions
sub try_lookup($)
{
  my $tok = shift;
  # case-sensitive lookup first
  if (exists $thes{$tok}) { return $thes{$tok}; }
  # case-insensitive lookup
  elsif (exists $thes{lc($tok)})
  {
    # Retrieve the lookup match
    my $ret = $thes{lc($tok)};

    # First letter was lowercase already
    if (substr($tok,0,1) ne uc(substr($tok,0,1))) { return $ret; }
    # Correct capitalization on substituted first letter
    return uc(substr($ret,0,1)) . substr($ret,1);
  }
  # nomatch, just return the word
  return $tok;
}

sub try_plural_lookup($$$)
{
  my $word = shift;
  my $plural_suffix = shift;
  my $plural_subst = shift;

  my $suf_len = length($plural_suffix);

  # See if word ending matches supplied plural_suffix.
  if (substr($word,-$suf_len) eq $plural_suffix) {
    # Suffix appears to match.  Create test tok by removing suffix
    #  and putting plural_subst on (e.g. piracIES -> piracY)
    my $tok = substr($word,0,length($word) - $suf_len) . $plural_subst;

    my $test = try_lookup($tok);
    # A smarter compressor would know the correct ending
    #  Then again, 's' is smaller than 'es' or 'ies' etc
    if ($test ne $tok) { return $test . 's'; }
  }
  return $word;
}

# Great, thesaurus populated, now read from stdin
while (my $line = <STDIN>)
{
  chomp $line;
  # Split into words
  my @tokens = split(/\s+/, $line);

  my @result;
  foreach my $token (@tokens)
  {
    # Take apart the token - remove leading and trailing quotes, punct, etc
    if ($token =~ m/^(.*?)([\w\d-]+)(.*?)$/)
    {
      my $tok = $2;

      # Substitution rules.
      $tok = try_lookup($tok);
      if ($tok eq $2) { $tok = try_plural_lookup($tok,'ies','y'); }
      if ($tok eq $2) { $tok = try_plural_lookup($tok,'es',''); }
      if ($tok eq $2) { $tok = try_plural_lookup($tok,'s',''); }

      push(@result, $1 . $tok . $3);
    } else {
      # Uncompressible punctuation or something
      push(@result, $token);
    }
  }
  print join(' ', @result) . "\n";
}

Discussions