Friday, April 09, 2010

[TECH] Super-fast tokenizing of sequence word patterns

C++ code colored by C++2HTML
#!/bin/perl 
#
# AHO-CORASICK-TOKENIZER on words, will also work on any delemiter.
# INPUT(1): Set of patterns (sequence of words) 'P'
# INPUT(2): Text 'T' 
# OUTPUT: report all the occuring patterns -- we don't even 
# want to know their locations. 
#
# NOTE: We don't want to worry much about the time required to 
# construct the failure function. All we need is to improve the
# scan speed.
#
# PATTERN FILE FORMAT:
# <annotation> $$$ pattern
#
# vamsik@engr.uconn.edu 04/08/2010
#

($#ARGV == 1) or die("USAGE: perl <name> pattern_file text_file");

open(PATTERN_FILE, $ARGV[0]) or die($!);

open(TEXT_FILE, $ARGV[1]) or die($!);
%AC_TREE;
%PATTERN_HASH;

$ANNOT_DELIM = ";;a;;";
$FAILED_DELIM = ";;f;;"; 
$HEIGHT_DELIM = ";;h;;"; #to move within the text

sub BuildKwordTree{

# root of the Aho-Corasick tree #
	my @params = @_;
	my $ac_root = $params[0];

	my $PFILE = $params[1];
	my ($line, $pword_list, $pattern, $curr_node, $pword);

	
	while(<$PFILE>){
		$line = $_; chomp($line);

		if($line =~ /([^\$]+)\$\$\$ (.*)/){
			$annotation = $1;

			$pattern = $2;
# split the pattern into words delimited by any #
# non-newline character. #
			@pword_list = split(/\s+/, $pattern);

			$curr_node = $ac_root; 
			foreach $pword (@pword_list){
				${$curr_node}{$pword} = 
					{} unless exists ${$curr_node}{$pword};

				$curr_node = ${$curr_node}{$pword};
			}
# put-the annotation into the last-node of the key-word tree 
			${$curr_node}{$ANNOT_DELIM} = $annotation;
		}
	}
}

#
# Build the failure function level-by level
#
#
sub BuildFailureFunction{
	my @params = @_;
	my $ac_root = $params[0];

	my ($k, @bfs_list, $keyword); 
#
# Incremental algorithm; For level-1 its the $ac_root itself  
#
	${$ac_root}{$FAILED_DELIM} = $ac_root;

	${$ac_root}{$HEIGHT_DELIM} = 0;
	foreach $k (keys %$ac_root){

		if(($k eq $FAILED_DELIM) or ($k eq $ANNOT_DELIM)
			or ($k eq $HEIGHT_DELIM)){

			next;
		}
		${${$ac_root}{$k}}{$FAILED_DELIM} =  $ac_root;

		${${$ac_root}{$k}}{$HEIGHT_DELIM} = 1;
		push(@bfs_list, ${$ac_root}{$k});
	}

	
	while($#bfs_list >= 0){
		$nref = shift(@bfs_list); #node

		foreach $keyword (keys %$nref){

			if(($keyword eq $FAILED_DELIM) or 
					($keyword eq $ANNOT_DELIM) or 
					($keyword eq $HEIGHT_DELIM)){

				next;
			}

			$nnref = ${$nref}{$keyword};

			$fpref = ${$nref}{$FAILED_DELIM}; 
			while(!(exists ${$fpref}{$keyword})

				and $fpref != $ac_root){
				$fpref = ${$fpref}{$FAILED_DELIM};
			}

			if(exists ${$fpref}{$keyword}){
				${$nnref}{$FAILED_DELIM} = ${$fpref}{$keyword};
			}else{

				${$nnref}{$FAILED_DELIM} = $ac_root; 
			}
			${$nnref}{$HEIGHT_DELIM} = ${$nref}{$HEIGHT_DELIM}+1;

			push(@bfs_list, $nnref);
		}
	}
}
#
# Reads word delimited text from the stream or a file
#
sub MatchWordDelimText{
	my @params = @_;

	my $ac_root = $params[0];
	my $stream = $params[1];

	my ($curr_state, $annot, $sword, $c, $height);

	my $line;

	while(<$stream>){
		$line = $_; chomp($line);

		$curr_state = $ac_root;
		@swords = split(/\s+/, $_);

		$c = 0; 
		while($c <= $#swords){

			$sword = $swords[$c];
# Change the state of the automaton by reading the input 
#			print "I:$sword C:$c 1:$curr_state ";
			if(exists ${$curr_state}{$sword}){

				$curr_state = ${$curr_state}{$sword};
				$c++;
			}else{

				if($curr_state == $ac_root){
					$c++;
				}
				$curr_state = ${$curr_state}{$FAILED_DELIM};
			}

#			print "2:$curr_state \n";
			if(exists ${$curr_state}{$ANNOT_DELIM}){
				$annot = ${$curr_state}{$ANNOT_DELIM};

				print "$annot \n";
			}
		}
	}
}

sub StressTestMatchWordDelimText{
	my @params = @_;

	my $ac_root = $params[0];
	my $stream = $params[1];

	my ($curr_state, $annot, $sword, $c, $height);

	my $line;
	my ($stress_me, $ss);
	$line = <$stream>;

	print "ABSTRACT: $line\n";
	chomp($line);
	$stress_me = 1000000;

	for($s=0; $s< $stress_me; $s++){
		if($s%100000 == 0){

			print "iteration $s \n";
		}
		$curr_state = $ac_root;
		@swords = split(/\s+/, $_);

		$c = 0; 
		while($c <= $#swords){

			$sword = $swords[$c];
# Change the state of the automaton by reading the input 
#			print "I:$sword C:$c 1:$curr_state ";
			if(exists ${$curr_state}{$sword}){

				$curr_state = ${$curr_state}{$sword};
				$c++;
			}else{

				if($curr_state == $ac_root){
					$c++;
				}
				$curr_state = ${$curr_state}{$FAILED_DELIM};
			}

#			print "2:$curr_state \n";
			if(exists ${$curr_state}{$ANNOT_DELIM}){
				$annot = ${$curr_state}{$ANNOT_DELIM};

#				print "$annot \n";
			}
		}
	}
}

#
# Just to see how the tree looks -- for viewing pleasure :)
#
sub PrintKwordTree{
	my @params = @_;

	my $ac_root = $params[0]; 
	my $level = $params[1];

	my ($k, $aref, $annot, $j, $nref, $fref, $height);

	foreach $k (keys %$ac_root){
		if(($k eq $ANNOT_DELIM) 
			or ($k eq $FAILED_DELIM) or ($k eq $HEIGHT_DELIM)){

			next;
		}
		for($j=0; $j<$level; $j++){

			print " ";
		}
		$fref = ${${$ac_root}{$k}}{$FAILED_DELIM};

		$nref = ${$ac_root}{$k};
		$height = ${$nref}{$HEIGHT_DELIM};

		print "-$k-> N=$nref F=$fref h=$height\n";
		PrintKwordTree(${$ac_root}{$k}, $level+1);
	}

	

	if(exists ${$ac_root}{$ANNOT_DELIM}){
		$annot = ${$ac_root}{$ANNOT_DELIM};

		for($j=0; $j<$level; $j++){
			print " ";
		}

		print "; $annot ;\n\n";
		return;
	}
}
#
# A simple unit-test
#
sub main{
	BuildKwordTree(\%AC_TREE, PATTERN_FILE);

#	PrintKwordTree(\%AC_TREE, 0);
	BuildFailureFunction(\%AC_TREE);
#	PrintKwordTree(\%AC_TREE, 0);
# Now build the failure function.
	print "please enter the words\n";
#	MatchWordDelimText(\%AC_TREE, STDIN);
	StressTestMatchWordDelimText(\%AC_TREE, TEXT_FILE);

	print "\n....THANK YOU......\n";
}
main();