C++ code colored by C++2HTML
#!/bin/perl
#
# AHO-CORASICK-TOKENIZER on words, will also work on any delemiter.
# INPUT(1): Set of patterns (sequence of words) 'P'
# INPUT(2): Text 'T'
# OUTPUT: report all the occuring patterns -- we don't even
# want to know their locations.
#
# NOTE: We don't want to worry much about the time required to
# construct the failure function. All we need is to improve the
# scan speed.
#
# PATTERN FILE FORMAT:
# <annotation> $$$ pattern
#
# vamsik@engr.uconn.edu 04/08/2010
#
($#ARGV == 1) or die("USAGE: perl <name> pattern_file text_file");
open(PATTERN_FILE, $ARGV[0]) or die($!);
open(TEXT_FILE, $ARGV[1]) or die($!);
%AC_TREE;
%PATTERN_HASH;
$ANNOT_DELIM = ";;a;;";
$FAILED_DELIM = ";;f;;";
$HEIGHT_DELIM = ";;h;;"; #to move within the text
sub BuildKwordTree{
# root of the Aho-Corasick tree #
my @params = @_;
my $ac_root = $params[0];
my $PFILE = $params[1];
my ($line, $pword_list, $pattern, $curr_node, $pword);
while(<$PFILE>){
$line = $_; chomp($line);
if($line =~ /([^\$]+)\$\$\$ (.*)/){
$annotation = $1;
$pattern = $2;
# split the pattern into words delimited by any #
# non-newline character. #
@pword_list = split(/\s+/, $pattern);
$curr_node = $ac_root;
foreach $pword (@pword_list){
${$curr_node}{$pword} =
{} unless exists ${$curr_node}{$pword};
$curr_node = ${$curr_node}{$pword};
}
# put-the annotation into the last-node of the key-word tree
${$curr_node}{$ANNOT_DELIM} = $annotation;
}
}
}
#
# Build the failure function level-by level
#
#
sub BuildFailureFunction{
my @params = @_;
my $ac_root = $params[0];
my ($k, @bfs_list, $keyword);
#
# Incremental algorithm; For level-1 its the $ac_root itself
#
${$ac_root}{$FAILED_DELIM} = $ac_root;
${$ac_root}{$HEIGHT_DELIM} = 0;
foreach $k (keys %$ac_root){
if(($k eq $FAILED_DELIM) or ($k eq $ANNOT_DELIM)
or ($k eq $HEIGHT_DELIM)){
next;
}
${${$ac_root}{$k}}{$FAILED_DELIM} = $ac_root;
${${$ac_root}{$k}}{$HEIGHT_DELIM} = 1;
push(@bfs_list, ${$ac_root}{$k});
}
while($#bfs_list >= 0){
$nref = shift(@bfs_list); #node
foreach $keyword (keys %$nref){
if(($keyword eq $FAILED_DELIM) or
($keyword eq $ANNOT_DELIM) or
($keyword eq $HEIGHT_DELIM)){
next;
}
$nnref = ${$nref}{$keyword};
$fpref = ${$nref}{$FAILED_DELIM};
while(!(exists ${$fpref}{$keyword})
and $fpref != $ac_root){
$fpref = ${$fpref}{$FAILED_DELIM};
}
if(exists ${$fpref}{$keyword}){
${$nnref}{$FAILED_DELIM} = ${$fpref}{$keyword};
}else{
${$nnref}{$FAILED_DELIM} = $ac_root;
}
${$nnref}{$HEIGHT_DELIM} = ${$nref}{$HEIGHT_DELIM}+1;
push(@bfs_list, $nnref);
}
}
}
#
# Reads word delimited text from the stream or a file
#
sub MatchWordDelimText{
my @params = @_;
my $ac_root = $params[0];
my $stream = $params[1];
my ($curr_state, $annot, $sword, $c, $height);
my $line;
while(<$stream>){
$line = $_; chomp($line);
$curr_state = $ac_root;
@swords = split(/\s+/, $_);
$c = 0;
while($c <= $#swords){
$sword = $swords[$c];
# Change the state of the automaton by reading the input
# print "I:$sword C:$c 1:$curr_state ";
if(exists ${$curr_state}{$sword}){
$curr_state = ${$curr_state}{$sword};
$c++;
}else{
if($curr_state == $ac_root){
$c++;
}
$curr_state = ${$curr_state}{$FAILED_DELIM};
}
# print "2:$curr_state \n";
if(exists ${$curr_state}{$ANNOT_DELIM}){
$annot = ${$curr_state}{$ANNOT_DELIM};
print "$annot \n";
}
}
}
}
sub StressTestMatchWordDelimText{
my @params = @_;
my $ac_root = $params[0];
my $stream = $params[1];
my ($curr_state, $annot, $sword, $c, $height);
my $line;
my ($stress_me, $ss);
$line = <$stream>;
print "ABSTRACT: $line\n";
chomp($line);
$stress_me = 1000000;
for($s=0; $s< $stress_me; $s++){
if($s%100000 == 0){
print "iteration $s \n";
}
$curr_state = $ac_root;
@swords = split(/\s+/, $_);
$c = 0;
while($c <= $#swords){
$sword = $swords[$c];
# Change the state of the automaton by reading the input
# print "I:$sword C:$c 1:$curr_state ";
if(exists ${$curr_state}{$sword}){
$curr_state = ${$curr_state}{$sword};
$c++;
}else{
if($curr_state == $ac_root){
$c++;
}
$curr_state = ${$curr_state}{$FAILED_DELIM};
}
# print "2:$curr_state \n";
if(exists ${$curr_state}{$ANNOT_DELIM}){
$annot = ${$curr_state}{$ANNOT_DELIM};
# print "$annot \n";
}
}
}
}
#
# Just to see how the tree looks -- for viewing pleasure :)
#
sub PrintKwordTree{
my @params = @_;
my $ac_root = $params[0];
my $level = $params[1];
my ($k, $aref, $annot, $j, $nref, $fref, $height);
foreach $k (keys %$ac_root){
if(($k eq $ANNOT_DELIM)
or ($k eq $FAILED_DELIM) or ($k eq $HEIGHT_DELIM)){
next;
}
for($j=0; $j<$level; $j++){
print " ";
}
$fref = ${${$ac_root}{$k}}{$FAILED_DELIM};
$nref = ${$ac_root}{$k};
$height = ${$nref}{$HEIGHT_DELIM};
print "-$k-> N=$nref F=$fref h=$height\n";
PrintKwordTree(${$ac_root}{$k}, $level+1);
}
if(exists ${$ac_root}{$ANNOT_DELIM}){
$annot = ${$ac_root}{$ANNOT_DELIM};
for($j=0; $j<$level; $j++){
print " ";
}
print "; $annot ;\n\n";
return;
}
}
#
# A simple unit-test
#
sub main{
BuildKwordTree(\%AC_TREE, PATTERN_FILE);
# PrintKwordTree(\%AC_TREE, 0);
BuildFailureFunction(\%AC_TREE);
# PrintKwordTree(\%AC_TREE, 0);
# Now build the failure function.
print "please enter the words\n";
# MatchWordDelimText(\%AC_TREE, STDIN);
StressTestMatchWordDelimText(\%AC_TREE, TEXT_FILE);
print "\n....THANK YOU......\n";
}
main();
No comments:
Post a Comment