#!/usr/bin/env python
#-*- coding: utf-8 -*-
#
# Script for producing input for cheap with the -yy and -default-les
# options turned on. It runs the text from the INPUT file through MeCab
# and represents each token as a tuple in the following format:

# 南アフリカ産の茎のない多肉植物の属 >>
#
# (17, 0, 1, <0:15>, 1, "南アフリカ", 0, "null", "名詞-固有名詞-地域-国+n-n" 1.0000)
# (18, 1, 2, <16:19>, 1, "産", 0, "null", "名詞-接尾-地域+n-n" 1.0000)
# (19, 2, 3, <20:23>, 1, "の", 0, "null", "助詞-連体化+n-n" 1.0000) 
# (20, 3, 4, <24:27>, 1, "茎", 0, "null", "名詞-一般+n-n" 1.0000)
# (21, 4, 5, <28:31>, 1, "の", 0, "null", "助詞-格助詞-一般+n-n" 1.0000)
# (22, 5, 6, <32:38>, 1, "ない", 0, "null", "形容詞-自立+形容詞・アウオ段-基本形" 1.0000)
# (23, 6, 7, <39:51>, 1, "多肉植物", 0, "null", "名詞-一般+n-n" 1.0000)
# (24, 7, 8, <52:55>, 1, "の", 0, "null", "助詞-連体化+n-n" 1.0000)
# (25, 8, 9, <56:59>, 1, "属", 0, "null", "名詞-サ変接続+n-n" 1.0000) 
#
# To execute it, use:
#
# python SCRIPT INPUT > OUTPUT    (where SCRIPT is the path to this file)
#
# To install MeCab, try
#
# sudo apt-get install python-yaml
# sudo apt-get install mecab-ipadic-utf8 python-mecab
#
# The following cheap command will parse a file `INFILE' and output a
# profile in `OUTDIR':
#
# cheap -comment-passthrough -mrs -nsolutions=1 -results=1 -packing=15 -timeout=10 -yy -default-les -tsdbdump=OUTDIR -inputfile=INFILE ~/logon/dfki/jacy/japanese.grm &> log

import sys
import xml.etree.ElementTree as ET
import MeCab
import os

def utf8_len(text):
    """Return the length of *text* in UTF-8 bytes.

    The YY character offsets produced by this script are byte offsets
    (see the example in the file header, where a five-character word
    spans <0:15>).  Byte strings are measured directly; text strings are
    encoded first so the offsets do not depend on the string type.
    """
    if isinstance(text, bytes):
        return len(text)
    return len(text.encode('utf-8'))


def build_pos_tag(feature):
    """Turn a comma-separated MeCab feature string into a YY POS tag.

    The tag is the major part of speech joined by '-' with those of the
    next three feature fields that are not '*', followed by '+' and the
    fifth and sixth fields joined by '-'.  A '*' in the fifth or sixth
    field is written as 'n'.

    Feature strings with fewer than six fields are padded with '*', so
    the missing fields are treated as unspecified instead of raising
    IndexError.
    """
    fields = (feature.split(",") + ['*'] * 6)[:6]
    pos = '-'.join([fields[0]] + [f for f in fields[1:4] if f != '*'])
    fifth = 'n' if fields[4] == '*' else fields[4]
    sixth = 'n' if fields[5] == '*' else fields[5]
    return pos + '+' + fifth + '-' + sixth


def format_yy_token(token_id, start, end, char_from, char_to, surface, pos_tag):
    """Format one token as a YY-mode tuple.

    token_id is the running token number, start/end are the word lattice
    positions, and char_from/char_to are the offsets written as <from:to>.
    """
    return '(%d, %d, %d, <%d:%d>, 1, "%s", 0, "null", "%s" 1.0000)' % (
        token_id, start, end, char_from, char_to, surface, pos_tag)


def sentence_tokens(node, first_id):
    """Return the YY token strings for one sentence.

    node is the head of a linked list of MeCab nodes (each providing
    `surface`, `feature` and `next`).  Tokens are numbered consecutively
    starting at first_id.  Nodes with an empty surface and the full stop
    '。' are skipped and do not advance the word or character position.
    """
    tokens = []
    word_pos = 0
    char_pos = 0
    while node:
        surface = node.surface
        char_end = char_pos + utf8_len(surface)
        if char_end != char_pos and surface != '。':
            tokens.append(format_yy_token(
                first_id + len(tokens), word_pos, word_pos + 1,
                char_pos, char_end, surface, build_pos_tag(node.feature)))
            word_pos += 1
            # Leave a one-unit gap between consecutive tokens.
            char_pos = char_end + 1
        node = node.next
    return tokens


def main(argv):
    """Tokenize the file named by argv[1] and write YY tuples to stdout.

    One output line is written per input line.  Token ids keep counting
    across lines; word and character positions restart on every line.
    Returns a process exit status (0 on success, 2 on a usage error).
    """
    if len(argv) < 2:
        sys.stderr.write('usage: %s INPUT > OUTPUT\n' % argv[0])
        return 2

    tagger = MeCab.Tagger('-Ochasen')
    token_id = 0
    # The context manager closes the input file even if tagging fails.
    with open(argv[1]) as infile:
        for line in infile:
            # `line` must stay bound to the input text until its nodes have
            # been consumed.  NOTE(review): presumably node surfaces can
            # refer to the input buffer in some MeCab bindings - confirm.
            tokens = sentence_tokens(tagger.parseToNode(line), token_id)
            token_id += len(tokens)
            for token in tokens:
                # Every token is followed by a single space, including the
                # last one on a line, matching the previous output exactly.
                sys.stdout.write(token + ' ')
            sys.stdout.write('\n')
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))