#!/bin/perl # # AHO-CORASICK-TOKENIZER on words, will also work on any delemiter. # INPUT(1): Set of patterns (sequence of words) 'P' # INPUT(2): Text 'T' # OUTPUT: report all the occuring patterns -- we don't even # want to know their locations. # # NOTE: We don't want to worry much about the time required to # construct the failure function. All we need is to improve the # scan speed. # # PATTERN FILE FORMAT: # <annotation> $$$ pattern # # vamsik@engr.uconn.edu 04/08/2010 # ($#ARGV == 1) or die("USAGE: perl <name> pattern_file text_file"); open(PATTERN_FILE, $ARGV[0]) or die($!); open(TEXT_FILE, $ARGV[1]) or die($!); %AC_TREE; %PATTERN_HASH; $ANNOT_DELIM = ";;a;;"; $FAILED_DELIM = ";;f;;"; $HEIGHT_DELIM = ";;h;;"; #to move within the text sub BuildKwordTree{ # root of the Aho-Corasick tree # my @params = @_; my $ac_root = $params[0]; my $PFILE = $params[1]; my ($line, $pword_list, $pattern, $curr_node, $pword); while(<$PFILE>){ $line = $_; chomp($line); if($line =~ /([^\$]+)\$\$\$ (.*)/){ $annotation = $1; $pattern = $2; # split the pattern into words delimited by any # # non-newline character. # @pword_list = split(/\s+/, $pattern); $curr_node = $ac_root; foreach $pword (@pword_list){ ${$curr_node}{$pword} = {} unless exists ${$curr_node}{$pword}; $curr_node = ${$curr_node}{$pword}; } # put-the annotation into the last-node of the key-word tree ${$curr_node}{$ANNOT_DELIM} = $annotation; } } } # # Build the failure function level-by level # # sub BuildFailureFunction{ my @params = @_; my $ac_root = $params[0]; my ($k, @bfs_list, $keyword); # # Incremental algorithm; For level-1 its the $ac_root itself # ${$ac_root}{$FAILED_DELIM} = $ac_root; ${$ac_root}{$HEIGHT_DELIM} = 0; foreach $k (keys %$ac_root){ if(($k eq $FAILED_DELIM) or ($k eq $ANNOT_DELIM) or ($k eq $HEIGHT_DELIM)){ next; } ${${$ac_root}{$k}}{$FAILED_DELIM} = $ac_root; ${${$ac_root}{$k}}{$HEIGHT_DELIM} = 1; push(@bfs_list, ${$ac_root}{$k}); } while($#bfs_list >= 0){ $nref = shift(@bfs_list); #node foreach $keyword (keys %$nref){ if(($keyword eq $FAILED_DELIM) or ($keyword eq $ANNOT_DELIM) or ($keyword eq $HEIGHT_DELIM)){ next; } $nnref = ${$nref}{$keyword}; $fpref = ${$nref}{$FAILED_DELIM}; while(!(exists ${$fpref}{$keyword}) and $fpref != $ac_root){ $fpref = ${$fpref}{$FAILED_DELIM}; } if(exists ${$fpref}{$keyword}){ ${$nnref}{$FAILED_DELIM} = ${$fpref}{$keyword}; }else{ ${$nnref}{$FAILED_DELIM} = $ac_root; } ${$nnref}{$HEIGHT_DELIM} = ${$nref}{$HEIGHT_DELIM}+1; push(@bfs_list, $nnref); } } } # # Reads word delimited text from the stream or a file # sub MatchWordDelimText{ my @params = @_; my $ac_root = $params[0]; my $stream = $params[1]; my ($curr_state, $annot, $sword, $c, $height); my $line; while(<$stream>){ $line = $_; chomp($line); $curr_state = $ac_root; @swords = split(/\s+/, $_); $c = 0; while($c <= $#swords){ $sword = $swords[$c]; # Change the state of the automaton by reading the input # print "I:$sword C:$c 1:$curr_state "; if(exists ${$curr_state}{$sword}){ $curr_state = ${$curr_state}{$sword}; $c++; }else{ if($curr_state == $ac_root){ $c++; } $curr_state = ${$curr_state}{$FAILED_DELIM}; } # print "2:$curr_state \n"; if(exists ${$curr_state}{$ANNOT_DELIM}){ $annot = ${$curr_state}{$ANNOT_DELIM}; print "$annot \n"; } } } } sub StressTestMatchWordDelimText{ my @params = @_; my $ac_root = $params[0]; my $stream = $params[1]; my ($curr_state, $annot, $sword, $c, $height); my $line; my ($stress_me, $ss); $line = <$stream>; print "ABSTRACT: $line\n"; chomp($line); $stress_me = 1000000; for($s=0; $s< $stress_me; $s++){ if($s%100000 == 0){ print "iteration $s \n"; } $curr_state = $ac_root; @swords = split(/\s+/, $_); $c = 0; while($c <= $#swords){ $sword = $swords[$c]; # Change the state of the automaton by reading the input # print "I:$sword C:$c 1:$curr_state "; if(exists ${$curr_state}{$sword}){ $curr_state = ${$curr_state}{$sword}; $c++; }else{ if($curr_state == $ac_root){ $c++; } $curr_state = ${$curr_state}{$FAILED_DELIM}; } # print "2:$curr_state \n"; if(exists ${$curr_state}{$ANNOT_DELIM}){ $annot = ${$curr_state}{$ANNOT_DELIM}; # print "$annot \n"; } } } } # # Just to see how the tree looks -- for viewing pleasure :) # sub PrintKwordTree{ my @params = @_; my $ac_root = $params[0]; my $level = $params[1]; my ($k, $aref, $annot, $j, $nref, $fref, $height); foreach $k (keys %$ac_root){ if(($k eq $ANNOT_DELIM) or ($k eq $FAILED_DELIM) or ($k eq $HEIGHT_DELIM)){ next; } for($j=0; $j<$level; $j++){ print " "; } $fref = ${${$ac_root}{$k}}{$FAILED_DELIM}; $nref = ${$ac_root}{$k}; $height = ${$nref}{$HEIGHT_DELIM}; print "-$k-> N=$nref F=$fref h=$height\n"; PrintKwordTree(${$ac_root}{$k}, $level+1); } if(exists ${$ac_root}{$ANNOT_DELIM}){ $annot = ${$ac_root}{$ANNOT_DELIM}; for($j=0; $j<$level; $j++){ print " "; } print "; $annot ;\n\n"; return; } } # # A simple unit-test # sub main{ BuildKwordTree(\%AC_TREE, PATTERN_FILE); # PrintKwordTree(\%AC_TREE, 0); BuildFailureFunction(\%AC_TREE); # PrintKwordTree(\%AC_TREE, 0); # Now build the failure function. print "please enter the words\n"; # MatchWordDelimText(\%AC_TREE, STDIN); StressTestMatchWordDelimText(\%AC_TREE, TEXT_FILE); print "\n....THANK YOU......\n"; } main();
Algorithms, Theory, Spirituality, Life, Technology, Food and Workout : trying to sort these deterministically in $\Theta(1)$ time (constant time).
Friday, April 09, 2010
[TECH] Super-fast tokenizing of sequence word patterns
Subscribe to:
Posts (Atom)