#!/usr/bin/perl
use strict;
use Encode;
#use utf8;
use Text::ChaSen; #apt-get install libtext-chasen-perl
my $sent = $ARGV[0] || "今日もしないね";
my $cost = $ARGV[1] || 4000;
my $fsep = "\t";
my $wsep = "\n";
my $format = join $fsep,
# "%ps", # start position (byte)
# "%pe", # end position (byte)
"%m", # surface form
"%y", # yomi
"%M", # lemma
"%P-"; # POS, separated by "-"
$format .= $wsep;
my $res = Text::ChaSen::getopt_argv('chasen-perl', '-j', '-F', $format);
my $euc=$sent;
## fixme use unicode properly
#Encode::from_to($euc, "utf-8", "euc-jp");
my $str = Text::ChaSen::sparse_tostr($euc);
#Encode::from_to($str, "euc-jp", "utf-8");
print "$str\n";
my @segments = split "$wsep", $str;
my $sid = 0;
pop @segments; ### lose the EOS
print "\n";
printf "\n", 0, scalar @segments;
foreach my $segment (@segments) {
if ($segment) {
my ($surface, $yomi, $lemma, $pos) = split /$fsep/, $segment;
printf "",
## fixme add cfrom cto
$sid, $sid, $sid +1;
print "$surface";
print "\n";
# printf "",
printf "",
$sid, $sid, $sid, $sid +1;
print "$pos";
print "\n";
$sid++;
}
}
print "\n";
# use MeCab;
# use Encode;
# use utf8;
# my %Node;
# my %NodeDist;
# my $m = new MeCab::Tagger ("-l3"); # return the lattice
# #my $sent = "今日もしないとね"; #example1
# #my $n = $m->parseToNode ("東京都"); #
# #my $sent = "東京";
# $euc=$sent;
# ## fixme use unicode properly
# Encode::from_to($euc, "utf-8", "euc-jp");
# my $n = $m->parseToNode ($euc); #
# ## fixme use unicode properly
# $length = scalar split //,$sent;
# #print "S Length :$length:\n";
# # go through once, to the end
# for (; $n->{next}; $n = $n->{next} ) {};
# # search backwords, depth first
# &search ($n);
# ### print header
# $date = `date --rfc-3339=seconds`;
# chomp $date;
# print "\n";
# print " \n";
# print " \n";
# print " \n";
# print " mecab2saf.perl\n";
# print " $date\n";
# print " \n";
# print "";
# ### print lattice
# for my $id (sort {$a <=> $b} keys %Node) {
# $cto = $length - $NodeDist{$id};
# $cfrom = $cto - $NodeD{$id};
# # $cfrom = $length - $NodeDist{$id}
# $surface = $Node{$id}->{surface};
# Encode::from_to($surface, "euc-jp", "utf-8");
# $features = $Node{$id}->{feature};
# Encode::from_to($features, "euc-jp", "utf-8");
# # ($pos1, $pos2, $pos3);
# $width = scalar split //, $surface;
# $cfrom = $cto - $width;
# if ($width) {
# # print "";
# print &xml_escape($surface); ### escape me
# # print " ($features)"; ### escape me
# # print "$surface"; ### escape me
# print "\n";
# }
# }
# ### print footer
# print " \n\n";
# sub search ()
# {
# my $rnode = shift @_;
# my $rid = $rnode->{id}; # Right Node ID
# my $bcost = $rnode->{cost}; # Right Node cost
# $Node{$rid} = $rnode; # mark nodes we have seen
# # print "searched: $rid\n";
# # 右 Node に接続するすべての Path について..
# for (my $p = $rnode->{lpath}; $p; $p = $p->{lnext}) {
# my $lnode = $p->{lnode};
# if ($lnode->{isbest} || $p->{cost} <= $cost) { # look for costs below this
# my $lid = $lnode->{id};
# # print "\n$rid is good, linked to $lid\n";
# my $utfsurf = $rnode->{surface};
# Encode::from_to($utfsurf, "euc-jp", "utf-8");
# my $width = scalar split //, $utfsurf;
# if ($rid) {
# $NodeDist{$lid} = $NodeDist{$rid} + $width;
# }else {
# $NodeDist{$lid} = 0;
# }
# # print "$lid [", $length - $NodeDist{$lid}, " - ",
# # $length - $NodeDist{$rid},"] ($utfsurf)\n";
# next if (defined $Node{$lid}); # ignore ones we've done already
# &search ($lnode); # recurse
# }
# }
# }
# # for my $id (sort {$a <=> $b} keys %Node) {
# # print "Id :", $id, ":\n";
# # print "Cost :", $Node{$id}->{cost}, ":\n";
# # print "Wcost :", $Node{$id}->{wcost}, ":\n";
# # print "Surface :", $Node{$id}->{surface}, ":\n";
# # print "Feature :", $Node{$id}->{feature}, ":\n";
# # print "Lpath :", $Node{$id}->{lpath}, ":\n";
# # # print "Rpath :", $Node{$id}->{rpath}->{id}, ":\n";
# # # print $Node{$id}->{isbest} ? "* " : " "; # "*" if it is part of the best parse
# # # nodeid TAB surface TAB POS1,POS2,POS3,POS5,Paradigm,Inflection,Base,Kana,Pronunciation,
# # # print $id, "\t", $Node{$id}->{surface}, "\t", $Node{$id}->{feature}, "\n";
# # }
# sub xml_escape {
# my $str = shift;
# $str =~ s/\&/&/g;
# $str =~ s/\>/>/g;
# $str =~ s/\</g;
# $str =~ s/\"/"/g;
# $str =~ s/\'/'/g;
# return $str;
# }
# #
# # ii
# # NNP
# # ii
# # NNP
# #