An exercise in simplicity

I have some Perl code.  As an exercise, I’d like to get it to the point where I can use it as a real-world example to teach someone Perl.  I have a specific person in mind.  This person last wrote software when PL/I was the big thing.  I’d like someone to help me annotate this code so that a Perl noob (but competent programmer) can understand it without damaging its functionality or unnecessarily increasing verbosity.  Here’s the code in question:

#!/usr/bin/perl
use strict;
use warnings;

##############################################################################
#                                                                            #
# This program is intended to take a plain text document filename as an      #
# argument, read the file into memory, and perform linear-weight word        #
# frequency analysys.  As an option, it will accept a comma-delimited list   #
# of domain-specific keywords that will weigh higher on a linear scale if    #
# found in the document.                                                     #
#                                                                            #
##############################################################################

#                             Perl, 80 columns                               #

##
# NOTES:
# I will use a linear ratio of 10:1 for text:keywords for this iteration of  #
# the build.  Files under 10 words long will have only one keyword.          #
##

# options
my $debug            = 0;    # 1=enable debug messages, 0=disable debug messages
my $DEBUG_OPTION     = "-D"; # Command-line switch to enable debugging
my $STOPWORDS_OPTION = "-w"; # Command-line argument to specift stopwords file
my $DEFAULT_STOPWORDS = "/usr/local/etc/stopwords";

# default stopwords filename

# blibals
my $text;                    # the plain text of the document
my $numkeys;                 # the number of keywords to extract
my $textlen;                 # the number of words in the text
my $inputfile;               # the name of the input file
my $stopwords_file = $DEFAULT_STOPWORDS;

# the name of the file containing the stopwords
my %table;                   # the keywords->counter table
my @words;                   # the keywords array

sub handle_args {

    # filename argument is always the last one in the array
    my $i;
    my $arg;    # since a modifiable @ARGV isn't a given, we need a temp
    my $numargs = $#ARGV + 1;
    die "wrong number of arguments" if $numargs < 1;

    $inputfile = pop;
    $numargs--;
    print "input file: $inputfile\n" if $debug;
    die "empty input file"           if length $inputfile < 1;    # sanity check

    # now iterate through argument vector to parse the rest
    for ( $i = 0 ; $i < $numargs ; $i++ ) {
        $arg = $ARGV[$i];
        chomp $arg;

        if ( $arg eq $DEBUG_OPTION ) {
            $debug = 1;
            next;
        }

        if ( $arg eq $STOPWORDS_OPTION ) {

            # roll forward and check for last arg
            die "too few arguments" if ++$i >= $numargs;

            # get teh stopwords file
            $stopwords_file = $ARGV[$i];
            next;
        }

        # this is a fallthrough for un-handled args
        die "I don't know what $arg means.";
    }

}

sub slurp {
    my $document_text;    # the contents of the text file

    # argument is the file name
    my $numargs = $#_ + 1;
    print "We have $numargs arguments.\n" if $debug;
    die "Wrong number of arguments." unless $numargs == 1;

    # read the file into a bugger
    local $/ = undef;     # unset the input delimiter
    open( DOC, $_[0] ) or die $!;
    $document_text = <DOC>;
    close DOC;

    if ($debug) {
        my $bytes = length $document_text;
        print "Slurped 0x";
        print hex $bytes;
        print " bytes into buffer.\n";

        print $document_text . "\n";
    }

    $document_text;

}

sub debone {
    my $numargs = $#_ + 1;
    die "wrong number of arguments" unless $numargs == 1;
    my $document_text = pop;
    my $stopword;
    my @stopwords;

    #DONE implement command-line option for stopwords file
    open( STOPWORDS, $stopwords_file ) or die $!;
    @stopwords = <STOPWORDS>;
    close STOPWORDS;
    chomp $_ foreach @stopwords;

    foreach $stopword (@stopwords) {
        $document_text =~ s/\b$stopword\b//ig;
    }

    $document_text =~ s/\W/ /g;
    $document_text =~ s/\s+/ /g;

    $document_text;
}

sub pack {

    # takes a single argument - the document text
    my $numargs = $#_ + 1;
    die "wrong number of arguments" unless $numargs == 1;
    my $document_text = pop;
    my @wordlist;
    my @buffer;
    my %grapes;

    @buffer = split(/ /, $document_text );
    foreach (@buffer) {
        push( @wordlist, $_ ) if length $_ > 1;
    }

    @grapes{@wordlist} = (0);
    @wordlist = sort keys %grapes;
}

sub freq {

    # takes two arguments - the keywords list and the document text
    my $numargs = $#_ + 1;
    die "wrong number of arguments" unless $numargs > 1;
    my $document_text = pop;
    my $keyword;
    $textlen = 0;
    my @wordlist = @_;
    my %grapes;

    for ( split(/ /, $document_text ) ) {
        $textlen++;
    }

    foreach $keyword (@wordlist) {
        my $count = 0;
        ++$count while $document_text =~ m/\b$keyword\b/g;
        $grapes{$keyword} = $count / $textlen;
    }

    %grapes;
}

# scaffold
if ($debug) {
    my $numargs = $#ARGV + 1;
    print "/usr/bin/perl got $numargs arguments from the shell.\n";
}

&handle_args(@ARGV);
$text  = &slurp($inputfile);
$text  = &debone($text);
@words = &pack($text);
%table = &freq( @words, $text );

#print $text . "\n" if $debug;
#print $_ . "\n" foreach (@words) if $debug;
my $key;
my $maxkeys = $textlen / 100;
my $count   = 0;
foreach $key ( sort { $table{$b} <=> $table{$a} } keys %table ) {
    if ($debug) {
        print "$key: $table{$key}\n";
    }

    else {
        print "$key";
    }

    $count++;
    if ( $count > $maxkeys ) {
        last;
    }

    else {
        print ',';
    }
}

print "\n" unless $debug;
Advertisements

  1. Leave a comment

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s

%d bloggers like this: