use SegmentTree;
use SuffixArray;
use FlowParser;
use Alphabet;
use strict;
use Time::HiRes qw/gettimeofday/;
use Encode;
# Print the usage text stored after __END__.
# The DATA section is read in full, decoded as cp1251 and re-encoded as
# cp866 before being written to STDOUT.
sub help {
	my $raw_usage = join("", <DATA>);
	my $usage     = decode("cp1251", $raw_usage);
	print encode("cp866", $usage);
}
# "-h" as the first argument prints the usage text and exits.
if (@ARGV && $ARGV[0] eq "-h") {
	help();
	exit;
}

# Data set name: first argument, falling back to "IE_large".
# The fallback must be part of the assigned expression: the low-precedence
# `or` binds looser than `=`, so `my $x = $ARGV[0] or "IE_large"` assigns
# $ARGV[0] and silently discards the default.
my $data_file = (defined($ARGV[0]) && length($ARGV[0])) ? $ARGV[0] : "IE_large";

# Load the precomputed structures for the data set from .\nloglcp\.
my $c = SuffixArray->get(".\\nloglcp\\$data_file.func.txt",".\\nloglcp\\$data_file.smlcp.txt");

# Token sequence (array ref) the terms are cut from.
my $data = $c->get_str();
print "funcs count:".scalar(@$data),$/;

# $sm and $lc are array refs indexed by rank; presumably the suffix array
# and its LCP array (names suggest so -- confirm against SuffixArray).
my $sm = $c->GetArray();
my $lc = $c->GetLcp();
# Inverse of $sm: $rev_sm[position] is the rank at which that position
# appears in $sm.
my @rev_sm;

# Start of the timed section; the elapsed time is reported after extraction.
my $start_time = gettimeofday();
for my $rank (0 .. $#$sm) {
	$rev_sm[$sm->[$rank]] = $rank;
}

# Bucket the positions by their $lc value: $srt[$len] is an array ref of
# all $sm entries whose $lc entry equals $len (buckets are created on
# first use by autovivification).
my @srt = ();
for my $rank (0 .. $#$sm) {
	push @{ $srt[$lc->[$rank]] }, $sm->[$rank];
}

# One "already covered" flag per entry of $sm, all initially 0.
# The parentheses around the repeat count are required: `x` binds tighter
# than `+`, so `(0) x $#$sm+1` evaluates to the one-element list (1),
# which marks position 0 as covered and leaves the rest unset.
my @hit = (0) x scalar(@$sm);

# Maximum number of terms to report (second argument, default 300).
# `||` is used instead of `or` so that the default really is assigned.
my $INIT_COUNT = $ARGV[1] || 300;
my $count = $INIT_COUNT;

# Minimum number of occurrences for a term to be kept (third argument,
# default 20).
my $minfreq = $ARGV[2] || 20;

# Accepted terms: comma-joined token list => { count, max, min, avg }.
my %features;
print "Terms for $data_file cnt:$INIT_COUNT frq:$minfreq\n";
# Main extraction loop.
# Walks the @srt buckets from the largest $lc value down, so longer
# candidates are examined before shorter ones. A candidate is the slice of
# $data starting at a position taken from the bucket and $lc->[rank] tokens
# long. It is accepted when enough of its occurrences do not touch positions
# already flagged in @hit; accepted occurrences stay flagged in @hit and the
# term is recorded in %features. Stops after $count terms have been accepted.
for (my $k = $#srt; $k >= 0; $k--){
	# No position has this $lc value.
	next if (!defined($srt[$k]));
	#print join ",",@{$srt[$k]},$/;
	for my $el (@{$srt[$k]}){
		# Candidate tokens: $lc->[rank] elements of $data starting at $sm->[rank],
		# where rank = $rev_sm[$el].
		my @res = @{$data}[$sm->[$rev_sm[$el]]..$sm->[$rev_sm[$el]] + $lc->[$rev_sm[$el]]-1];
		# Same token sequence was already accepted.
		next if defined($features{join ',',@res});
		# Skip if the first or last position of this occurrence, or of the
		# occurrence at the next rank, is already flagged in @hit.
		# NOTE(review): for the last rank, $sm->[rank+1] is past the end of $sm --
		# confirm whether that case can occur / is intended.
		next if ($hit[$sm->[$rev_sm[$el]]] ==1 or $hit[$sm->[$rev_sm[$el]]+$lc->[$rev_sm[$el]]-1] ==1 or
			$hit[$sm->[$rev_sm[$el]+1]] ==1 or $hit[$sm->[$rev_sm[$el]+1]+$lc->[$rev_sm[$el]]-1] ==1);
		
		# Grow the rank interval [$imin, $imax] around the candidate while the
		# neighbouring $lc entries are at least the candidate's length.
		# NOTE(review): the lower bound test is `$imin > 1`, so $lc->[0] is never
		# examined and rank 0 is never reached by this loop -- confirm this is intended.
		my ($imin,$imax) = ($rev_sm[$el],$rev_sm[$el]);
		while ($imin > 1 && $lc->[$imin-1] >= $lc->[$rev_sm[$el]]){$imin--;}
		while ($imax < $#$sm && $lc->[$imax+1] >= $lc->[$rev_sm[$el]]){$imax++;}
		# Number of occurrences claimed for this candidate.
		my $freq = 0;
		# Start positions of the claimed occurrences.
		my @dfs;
		for (my $t = $imin; $t <= $imax; $t++){ 
			# Same end-point test as above, applied to ranks $t and $t+1.
			# NOTE(review): $sm->[$t+1] is read past the end of $sm when
			# $t == $#$sm -- confirm.
			next if ($hit[$sm->[$t]] ==1 || $hit[$sm->[$t]+$lc->[$rev_sm[$el]]-1] ==1 ||
				$hit[$sm->[$t+1]] ==1 || $hit[$sm->[$t+1]+$lc->[$rev_sm[$el]]-1] ==1);
			# Tentatively flag every position of this occurrence.
			for (my $i=$sm->[$t]; $i < $sm->[$t]+$lc->[$rev_sm[$el]]; $i++){
				$hit[$i] = 1;
			}
			push @dfs, $sm->[$t];
			$freq++;
		}
		#print $el." ".$freq.$/;
		# Too rare: undo the tentative flags and drop the candidate.
		if ($freq < $minfreq){
			for my $pos (@dfs){
				for (my $i=$pos; $i < $pos + $lc->[$rev_sm[$el]]; $i++){
					$hit[$i] = 0;
				}
			}
			next;
		}
		# Record the accepted term under its comma-joined token list.
		my $m = join ',',@res;
		$features{$m}{count} = $freq;
		$features{$m}{max} = 0;
		$features{$m}{min} = 0;
		$features{$m}{avg} = 0;
		
		# Gap statistics between consecutive occurrence start positions.
		# {avg} accumulates the SUM of the gaps here; it is not divided in this loop.
		# {min} uses 0 as "not set yet".
		@dfs = sort {$a <=> $b} @dfs;
		for (my $mi = 0; $mi < $#dfs; $mi++){
			my $diff = abs($dfs[$mi]-$dfs[$mi+1]);
			if ($features{$m}{max} < $diff){
				$features{$m}{max} = $diff;
			}
			if ($features{$m}{min} == 0 or $features{$m}{min} > $diff){
				$features{$m}{min} = $diff;
			}
			$features{$m}{avg} += $diff;
		}
		# Progress line: number of comma-separated tokens, occurrence count,
		# gap sum, largest gap.
		print "Found ".scalar(split ',',$m)." $features{$m}{count} $features{$m}{avg} $features{$m}{max}\n";
		#$features{$m}{avg} = $features{$m}{avg}/$features{$m}{count};
		# One fewer term left to find; stop once the requested number is reached.
		$count--;
		last if ($count <= 0);
	}
	last if ($count <= 0);
	# Stop once a bucket with $k below 5 has been processed. This check is only
	# reached for defined buckets (undefined ones `next` past it), so the first
	# defined bucket with $k < 5 is still processed before the loop ends.
	last if ($k < 5);
}
# Elapsed wall-clock time of the extraction, in seconds.
$start_time = gettimeofday()-$start_time;
print  "$data_file=$start_time\n";

# Write the report to .\terms\"<name> <minfreq> <count>.terms.txt".
# A lexical handle is used and the open/close are checked, so a missing
# "terms" folder is reported instead of silently producing no output.
my $terms_path = ".\\terms\\$data_file $minfreq $INIT_COUNT.terms.txt";
open(my $out, ">", $terms_path) or die "Cannot open '$terms_path' for writing: $!";
print $out "$data_file=$start_time\n";

# Number of comma-separated tokens per term, computed once rather than on
# every sort comparison.
my %len_of;
for my $term (keys %features) {
	my @tokens = split ',', $term;
	$len_of{$term} = scalar(@tokens);
}

# One line per term, longest terms first:
#   <token count> <term> <occurrences> max:<largest gap> avg:<gap sum / occurrences> min:<smallest gap>
for my $term (sort { $len_of{$b} <=> $len_of{$a} } keys %features) {
	my $occurrences = $features{$term}{count};
	# {avg} holds the sum of the gaps; guard the division in case a term was
	# recorded with zero occurrences.
	my $avg_gap = $occurrences ? int($features{$term}{avg} / $occurrences) : 0;
	print $out $len_of{$term}." ".$term." ".$occurrences." max:".$features{$term}{max}." avg:".$avg_gap." min:".$features{$term}{min},$/;
}
close($out) or die "Cannot close '$terms_path': $!";
__END__
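ExtractTerms.pl - extracts frequently repeated terms from a data set
	ExtractTerms.pl Name Count minfreq
	Name - data set name, without extension (the files are read from the folder nloglcp)
	Count - maximum number of terms to extract
	minfreq - minimum number of occurrences of a term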