#!/usr/bin/env python
#-*- coding: utf-8 -*-

###
### Script for dividing a corpus consisting of two big line-aligned files
### into profiles of a fixed number of items each (the profile length is
### given as the first command-line argument)
###

import os, sys

def ensure_dir(path, mode=0o775):
    """Create directory *path* with *mode* unless it already exists.

    Only the "already exists as a directory" case is tolerated; any other
    failure (permissions, a regular file in the way, ...) is re-raised
    instead of being silently ignored.
    """
    try:
        os.mkdir(path, mode)
    except OSError:
        if not os.path.isdir(path):
            raise


def open_profile(bitextpath, corpus, number):
    """Prepare profile *number* and open its two bitext files for writing.

    The profile lives in ``bitextpath + corpus + str(number)``; the files are
    ``bitext/original`` and ``bitext/object`` below it.  Both directory
    levels are created independently, so an existing profile directory that
    lacks its ``bitext`` subdirectory is repaired rather than skipped.

    Returns the pair ``(original_file, object_file)``.
    """
    bitext = bitextpath + corpus + str(number) + '/bitext'
    ensure_dir(bitextpath + corpus + str(number))
    ensure_dir(bitext)
    original = open(bitext + '/original', 'w')
    try:
        translated = open(bitext + '/object', 'w')
    except EnvironmentError:
        # Do not leak the first handle when the second open fails.
        original.close()
        raise
    return original, translated


def aligned_pairs(source_lines, target_lines):
    """Yield ``(source, target)`` texts paired by line number.

    Line n of *source_lines* belongs to line n of *target_lines*.  Pairs
    whose target line starts with ``'EntL'`` are skipped.  Only the line
    terminator is stripped, so a final line without a trailing newline keeps
    its last character.

    Raises ValueError when a kept target line has no matching source line.
    """
    source_iter = iter(source_lines)
    for number, target in enumerate(target_lines, 1):
        # Always advance the source side so both files stay in lockstep,
        # even for target lines that end up being skipped.
        source = next(source_iter, None)
        if target.startswith('EntL'):
            continue
        if source is None:
            raise ValueError(
                'target line %d has no matching source line' % number)
        yield source.rstrip('\n'), target.rstrip('\n')


def split_profiles(profilelength, corpus, inpath, outpath, transdir):
    """Split two line-aligned files into numbered profiles.

    profilelength -- number of line pairs per profile (must be >= 1)
    corpus        -- corpus name, used in the directory names
    inpath        -- path of the file written out as ``bitext/original``
    outpath       -- path of the file written out as ``bitext/object``
    transdir      -- path prefix; it is concatenated as-is, so it is
                     expected to end with a path separator

    Profiles are written to
    ``<transdir><corpus>-profiles/<corpus><N>/bitext/{original,object}``
    with N counting from 1.

    Returns the number of the last profile that was opened.  NOTE: a new
    profile is opened as soon as the previous one is full, so when the number
    of pairs is an exact multiple of *profilelength* (including zero pairs)
    the last profile is empty.  This matches the historical behaviour of the
    script and is kept so that consumers of the printed count are unaffected.
    """
    if profilelength < 1:
        raise ValueError('profilelength must be at least 1')

    bitextpath = transdir + corpus + '-profiles/'

    # Open the inputs first so that a missing input file does not leave
    # freshly created, empty output directories behind.
    with open(inpath) as infile, open(outpath) as outfile:
        if not os.path.exists(bitextpath):
            os.makedirs(bitextpath)

        profile = 1
        written = 0
        orig, obje = open_profile(bitextpath, corpus, profile)
        try:
            for source, target in aligned_pairs(infile, outfile):
                orig.write(source + '\n')
                obje.write(target + '\n')
                written += 1
                if written == profilelength:
                    # Current profile is full: flush it to disk and move on.
                    orig.close()
                    obje.close()
                    profile += 1
                    orig, obje = open_profile(bitextpath, corpus, profile)
                    written = 0
        finally:
            # close() is idempotent, so this is safe on every exit path.
            orig.close()
            obje.close()
    return profile


def main(argv):
    """Command-line entry point; prints the number of profiles opened."""
    if len(argv) < 6:
        sys.exit('usage: %s PROFILELENGTH CORPUS INFILE OUTFILE TRANSDIR'
                 % argv[0])
    count = split_profiles(int(argv[1]), argv[2], argv[3], argv[4], argv[5])
    print(count)


if __name__ == '__main__':
    main(sys.argv)