#!/usr/bin/perl my $sent = $ARGV[0] || "今日もしないね"; my $cost = $ARGV[1] || 4000; use MeCab; use Encode; use utf8; my %Node; my %NodeDist; my $m = new MeCab::Tagger ("-l3"); # return the lattice #my $sent = "今日もしないとね"; #example1 #my $n = $m->parseToNode ("東京都"); # #my $sent = "東京"; $euc=$sent; ## fixme use unicode properly Encode::from_to($euc, "utf-8", "euc-jp"); my $n = $m->parseToNode ($euc); # ## fixme use unicode properly $length = scalar split //,$sent; #print "S Length :$length:\n"; # go through once, to the end for (; $n->{next}; $n = $n->{next} ) {}; # search backwords, depth first &search ($n); ### print header $date = `date --rfc-3339=seconds`; chomp $date; print "\n"; print " \n"; print " \n"; print " \n"; print " mecab2saf.perl\n"; print " $date\n"; print " \n"; print ""; ### print lattice for my $id (sort {$a <=> $b} keys %Node) { $cto = $length - $NodeDist{$id}; $cfrom = $cto - $NodeD{$id}; # $cfrom = $length - $NodeDist{$id} $surface = $Node{$id}->{surface}; Encode::from_to($surface, "euc-jp", "utf-8"); $features = $Node{$id}->{feature}; Encode::from_to($features, "euc-jp", "utf-8"); # ($pos1, $pos2, $pos3); $width = scalar split //, $surface; $cfrom = $cto - $width; if ($width) { # print ""; print &xml_escape($surface); ### escape me # print " ($features)"; ### escape me # print "$surface"; ### escape me print "\n"; } } ### print footer print " \n\n"; sub search () { my $rnode = shift @_; my $rid = $rnode->{id}; # Right Node ID my $bcost = $rnode->{cost}; # Right Node cost $Node{$rid} = $rnode; # mark nodes we have seen # print "searched: $rid\n"; # 右 Node に接続するすべての Path について.. for (my $p = $rnode->{lpath}; $p; $p = $p->{lnext}) { my $lnode = $p->{lnode}; if ($lnode->{isbest} || $p->{cost} <= $cost) { # look for costs below this my $lid = $lnode->{id}; # print "\n$rid is good, linked to $lid\n"; my $utfsurf = $rnode->{surface}; Encode::from_to($utfsurf, "euc-jp", "utf-8"); my $width = scalar split //, $utfsurf; if ($rid) { $NodeDist{$lid} = $NodeDist{$rid} + $width; }else { $NodeDist{$lid} = 0; } # print "$lid [", $length - $NodeDist{$lid}, " - ", # $length - $NodeDist{$rid},"] ($utfsurf)\n"; next if (defined $Node{$lid}); # ignore ones we've done already &search ($lnode); # recurse } } } # for my $id (sort {$a <=> $b} keys %Node) { # print "Id :", $id, ":\n"; # print "Cost :", $Node{$id}->{cost}, ":\n"; # print "Wcost :", $Node{$id}->{wcost}, ":\n"; # print "Surface :", $Node{$id}->{surface}, ":\n"; # print "Feature :", $Node{$id}->{feature}, ":\n"; # print "Lpath :", $Node{$id}->{lpath}, ":\n"; # # print "Rpath :", $Node{$id}->{rpath}->{id}, ":\n"; # # print $Node{$id}->{isbest} ? "* " : " "; # "*" if it is part of the best parse # # nodeid TAB surface TAB POS1,POS2,POS3,POS5,Paradigm,Inflection,Base,Kana,Pronunciation, # # print $id, "\t", $Node{$id}->{surface}, "\t", $Node{$id}->{feature}, "\n"; # } sub xml_escape { my $str = shift; $str =~ s/\&/&/g; $str =~ s/\>/>/g; $str =~ s/\ # ii # NNP # ii # NNP #