#!/usr/bin/perl
my $sent = $ARGV[0] || "今日もしないね";
my $cost = $ARGV[1] || 4000;
use MeCab;
use Encode;
use utf8;
my %Node;
my %NodeDist;
my $m = new MeCab::Tagger ("-l3"); # return the lattice
#my $sent = "今日もしないとね"; #example1
#my $n = $m->parseToNode ("東京都"); #
#my $sent = "東京";
$euc=$sent;
## fixme use unicode properly
Encode::from_to($euc, "utf-8", "euc-jp");
my $n = $m->parseToNode ($euc); #
## fixme use unicode properly
$length = scalar split //,$sent;
#print "S Length :$length:\n";
# go through once, to the end
for (; $n->{next}; $n = $n->{next} ) {};
# search backwords, depth first
&search ($n);
### print header
$date = `date --rfc-3339=seconds`;
chomp $date;
print "\n";
print " \n";
print " \n";
print " \n";
print " mecab2saf.perl\n";
print " $date\n";
print " \n";
print "";
### print lattice
for my $id (sort {$a <=> $b} keys %Node) {
$cto = $length - $NodeDist{$id};
$cfrom = $cto - $NodeD{$id};
# $cfrom = $length - $NodeDist{$id}
$surface = $Node{$id}->{surface};
Encode::from_to($surface, "euc-jp", "utf-8");
$features = $Node{$id}->{feature};
Encode::from_to($features, "euc-jp", "utf-8");
# ($pos1, $pos2, $pos3);
$width = scalar split //, $surface;
$cfrom = $cto - $width;
if ($width) {
# print "";
print &xml_escape($surface); ### escape me
# print " ($features)"; ### escape me
# print "$surface"; ### escape me
print "\n";
}
}
### print footer
print " \n\n";
sub search ()
{
my $rnode = shift @_;
my $rid = $rnode->{id}; # Right Node ID
my $bcost = $rnode->{cost}; # Right Node cost
$Node{$rid} = $rnode; # mark nodes we have seen
# print "searched: $rid\n";
# 右 Node に接続するすべての Path について..
for (my $p = $rnode->{lpath}; $p; $p = $p->{lnext}) {
my $lnode = $p->{lnode};
if ($lnode->{isbest} || $p->{cost} <= $cost) { # look for costs below this
my $lid = $lnode->{id};
# print "\n$rid is good, linked to $lid\n";
my $utfsurf = $rnode->{surface};
Encode::from_to($utfsurf, "euc-jp", "utf-8");
my $width = scalar split //, $utfsurf;
if ($rid) {
$NodeDist{$lid} = $NodeDist{$rid} + $width;
}else {
$NodeDist{$lid} = 0;
}
# print "$lid [", $length - $NodeDist{$lid}, " - ",
# $length - $NodeDist{$rid},"] ($utfsurf)\n";
next if (defined $Node{$lid}); # ignore ones we've done already
&search ($lnode); # recurse
}
}
}
# for my $id (sort {$a <=> $b} keys %Node) {
# print "Id :", $id, ":\n";
# print "Cost :", $Node{$id}->{cost}, ":\n";
# print "Wcost :", $Node{$id}->{wcost}, ":\n";
# print "Surface :", $Node{$id}->{surface}, ":\n";
# print "Feature :", $Node{$id}->{feature}, ":\n";
# print "Lpath :", $Node{$id}->{lpath}, ":\n";
# # print "Rpath :", $Node{$id}->{rpath}->{id}, ":\n";
# # print $Node{$id}->{isbest} ? "* " : " "; # "*" if it is part of the best parse
# # nodeid TAB surface TAB POS1,POS2,POS3,POS5,Paradigm,Inflection,Base,Kana,Pronunciation,
# # print $id, "\t", $Node{$id}->{surface}, "\t", $Node{$id}->{feature}, "\n";
# }
sub xml_escape {
my $str = shift;
$str =~ s/\&/&/g;
$str =~ s/\>/>/g;
$str =~ s/\</g;
$str =~ s/\"/"/g;
$str =~ s/\'/'/g;
return $str;
}
#
# ii
# NNP
# ii
# NNP
#