No idea where to put source code on this site, so I'll just make a project log of it.
#!/usr/bin/perl
use strict;
use warnings;
### LOSSY TEXT COMPESSION
# Conventional compression utilities rely on a "dictionary" to expand
# indices into complete tokens.
# This compressor instead uses a thesaurus to minimize output file size.
my %thes;
{
# Open moby thesaurus.
# You may substitute your own thesaurus file. The format is
# word,synonym1,synonym2,...\r
open (T, '<', 'mthes/mobythes.aur') or die "Couldn't open thesaurus: $!\n";
# Moby thesaurus is CR-terminated
local $/ = "\r";
# Read line at a time
while (my $line = <T>)
{
chomp $line;
# Pull keyword and then all synonyms.
if ($line =~ m/^([^,]+),(.+)$/)
{
my @syns = split(/,/, $2);
# identify shortest syn
my $best_syn = $1;
foreach my $syn (@syns)
{
# Straight abbreviations aren't very funny
next if ($syn eq uc($syn));
# Two letter synonyms aren't readable
next if (length($syn) < 3);
# Optionally, filter stupidly overcommon synonyms
#next if ($syn eq 'air' || $syn eq 'bed');
if (length($syn) < length($best_syn)) { $best_syn = $syn; }
}
# Save RAM: store syn only if it's better than the keyword
if ($best_syn ne $1)
{
$thes{$1} = $best_syn;
}
} else {
# Your thesaurus is broken.
die "%% $line\n";
}
}
close (T);
}
# Some functions
sub try_lookup($)
{
my $tok = shift;
# case-sensitive lookup first
if (exists $thes{$tok}) { return $thes{$tok}; }
# case-insensitive lookup
elsif (exists $thes{lc($tok)})
{
# Retrieve the lookup match
my $ret = $thes{lc($tok)};
# First letter was lowercase already
if (substr($tok,0,1) ne uc(substr($tok,0,1))) { return $ret; }
# Correct capitalization on substituted first letter
return uc(substr($ret,0,1)) . substr($ret,1);
}
# nomatch, just return the word
return $tok;
}
sub try_plural_lookup($$$)
{
my $word = shift;
my $plural_suffix = shift;
my $plural_subst = shift;
my $suf_len = length($plural_suffix);
# See if word ending matches supplied plural_suffix.
if (substr($word,-$suf_len) eq $plural_suffix) {
# Suffix appears to match. Create test tok by removing suffix
# and putting plural_subst on (e.g. piracIES -> piracY)
my $tok = substr($word,0,length($word) - $suf_len) . $plural_subst;
my $test = try_lookup($tok);
# A smarter compressor would know the correct ending
# Then again, 's' is smaller than 'es' or 'ies' etc
if ($test ne $tok) { return $test . 's'; }
}
return $word;
}
# Great, thesaurus populated, now read from stdin
while (my $line = <STDIN>)
{
chomp $line;
# Split into words
my @tokens = split(/\s+/, $line);
my @result;
foreach my $token (@tokens)
{
# Take apart the token - remove leading and trailing quotes, punct, etc
if ($token =~ m/^(.*?)([\w\d-]+)(.*?)$/)
{
my $tok = $2;
# Substitution rules.
$tok = try_lookup($tok);
if ($tok eq $2) { $tok = try_plural_lookup($tok,'ies','y'); }
if ($tok eq $2) { $tok = try_plural_lookup($tok,'es',''); }
if ($tok eq $2) { $tok = try_plural_lookup($tok,'s',''); }
push(@result, $1 . $tok . $3);
} else {
# Uncompressible punctuation or something
push(@result, $token);
}
}
print join(' ', @result) . "\n";
}
Discussions
Become a Hackaday.io Member
Create an account to leave a comment. Already have an account? Log In.