"""Pack and unpack EPE task files.

The script performs both packing and unpacking: packing concatenates the
``.txt`` documents of a task into a single ``<task>.txt`` file, writing a
"Document <key> ends." separator after each document; unpacking splits such
a file (after it has been processed) back into one file per document by
looking for those separator keys.
"""
import pickle
import sys
from argparse import ArgumentParser
from os import makedirs, path, walk

SPLITS = ['development', 'evaluation', 'training']

# Number of files expected per split for each task; unpacking a task must
# produce exactly this many files in every split.
EXPECTED_FILE_COUNTS = {
    'events': {'training': 410, 'development': 83, 'evaluation': 0},
    'opinion': {'training': 204, 'development': 49, 'evaluation': 0},
    'negation': {'training': 1, 'development': 1, 'evaluation': 1},
}


def _load_pickle(filename, description):
    """Load the pickle *filename* from the working directory.

    Prints a message naming the file (with *description* appended) and exits
    with status 1 when the file cannot be opened.
    """
    # NOTE(review): pickle executes arbitrary code on load; these mapping
    # files must come from a trusted source.
    try:
        with open(filename, "rb") as handle:
            return pickle.load(handle)
    except IOError:
        print("%s %s must be in the same directory" % (filename, description))
        sys.exit(1)


def _document_stem(p_filename):
    """Return the base name of a "split/name.ext" entry without its extension."""
    return path.splitext(p_filename.split('/')[-1])[0]


def _print_missing_files(task, split, found):
    """Print every file the filename-key mapping lists for *split* that is not in *found*."""
    filename_key = _load_pickle(task + "_fk.p", "filename-key pickle file")
    for filename in filename_key:
        parts = filename.split('/')
        if parts[0] == split and parts[-1] not in found:
            print(parts[-1])


def check_file_count(task, file_counts):
    """Verify that unpacking produced the expected number of files per split.

    Args:
        task: one of 'events', 'opinion' or 'negation'; any other value is
            accepted and nothing is checked.
        file_counts: dict mapping a split name to the list of file names
            written for that split.

    Exits with status 1 (after printing a diagnostic) on the first split
    whose number of files differs from EXPECTED_FILE_COUNTS. For the
    'opinion' task the names of the missing files are printed as well.
    """
    expected = EXPECTED_FILE_COUNTS.get(task)
    if expected is None:
        return
    for split in file_counts:
        found = file_counts[split]
        if len(found) == expected[split]:
            continue
        if task == "opinion":
            print('''The total number of parsed files %d in %s doesn't match the original number of files %d'''
                  % (len(found), split, expected[split]))
            _print_missing_files(task, split, found)
        else:
            print('''The total number of parsed files in %s doesn't match the original number of files''' % (split))
        sys.exit(1)


def pack(epe_path, task):
    """Concatenate all ``.txt`` files of *task* into ``<task>.txt``.

    Args:
        epe_path: directory containing one sub-directory per task, each with
            one sub-directory per split (see SPLITS).
        task: task name; also the prefix of the ``<task>_fk.p`` pickle that
            maps "split/filename" to the key written in the separator.

    The output file is created in the current working directory. Each
    document is followed by a "Document <key> ends." separator surrounded by
    blank lines. Exits with status 1 if the filename-key pickle is missing;
    a ``.txt`` file absent from that mapping raises KeyError.
    """
    filename_key = _load_pickle(task + "_fk.p", "filename-key pickle file")
    with open(task + '.txt', 'w') as outfile:
        for split in SPLITS:
            for root, _dirs, files in walk(path.join(epe_path, task, split)):
                for filename in files:
                    if not filename.endswith(".txt"):
                        continue
                    # Keys are always "split/filename", even for files found
                    # in nested directories.
                    p_filename = split + "/" + filename
                    separator = "\n\n\nDocument " + filename_key[p_filename] + " ends.\n\n\n"
                    with open(path.join(root, filename)) as infile:
                        outfile.writelines(infile)
                    outfile.write(separator)
    print("Finished packing files for %s" % (task))


def unpack(infile, task, outpath, format='CoNLL-U'):
    """Split a packed file back into one file per document.

    Args:
        infile: path of the packed file to split.
        task: task name; also the prefix of the ``<task>_kf.p`` pickle that
            maps a separator key to "split/filename".
        outpath: directory under which ``<task>/<split>/`` is created.
        format: 'plain' or 'CoNLL-U'. Any other value creates the output
            directories and then does nothing.

    In 'plain' mode a document ends at a line that (minus its newline) is a
    key of the mapping, and is written under its original file name. In
    'CoNLL-U' mode a document ends at a line whose second tab-separated
    column is a key; the output keeps the original base name but takes the
    extension of *infile*, and the number of files written is validated with
    check_file_count, which exits with status 1 on a mismatch.
    """
    file_counts = {}
    # splitext only looks at the last path component, so dots in directory
    # names cannot leak into the extension.
    extension = path.splitext(infile)[1]
    for split in SPLITS:
        if not path.exists(path.join(outpath, task, split)):
            makedirs(path.join(outpath, task, split))
        file_counts[split] = []

    key_filename = _load_pickle(task + "_kf.p", "key-filename pickle files")

    if format == 'plain':
        buffered = []
        with open(infile, 'r') as handle:
            for line in handle:
                key = line.strip('\n')
                if key in key_filename:
                    p_filename = key_filename[key]
                    split = p_filename.split('/')[0]
                    filename = p_filename.split('/')[-1]
                    with open(path.join(outpath, task, split, filename), 'w') as outfile:
                        outfile.write("".join(buffered))
                    buffered = []
                else:
                    buffered.append(line)
    elif format == 'CoNLL-U':
        buffered = []
        # True while skipping the remainder of a separator sentence
        # (the tokens "ends" and ".") after its key token was seen.
        in_separator = False
        with open(infile, 'r') as handle:
            for line in handle:
                if len(line) < 2 or line[0] == '#':
                    # A blank or comment line ends any separator sentence.
                    buffered.append(line)
                    in_separator = False
                    continue
                fields = line.split('\t')
                if len(fields) > 1 and fields[1] in key_filename:
                    p_filename = key_filename[fields[1]]
                    split = p_filename.split('/')[0]
                    stem = _document_stem(p_filename)
                    with open(path.join(outpath, task, split, stem + extension), 'w') as outfile:
                        # The last buffered line is the token "Document"
                        # of the separator; it is not part of the document.
                        outfile.write("".join(buffered[:-1]))
                    file_counts[split].append(stem + ".txt")
                    in_separator = True
                    buffered = []
                elif not in_separator:
                    buffered.append(line)
        check_file_count(task, file_counts)
    # Disabled CoNLL variant kept for reference (indentation reconstructed):
    # elif format == 'CoNLL':
    #     ignore = 0
    #     save = True
    #     with open(infile, 'r') as file:
    #         for line in file:
    #             if len(line.split("# text = ")) > 1:
    #                 key = line.split("# text = ")[1].strip()
    #                 if key in key_filename:
    #                     p_filename = key_filename[key]
    #                     split = p_filename.split('/')[0]
    #                     filename = '.'.join(p_filename.split('/')[-1].split('.')[:-1]) + extension
    #                     with open(path.join(outpath, task, split, filename), 'w') as outfile:
    #                         outfile.write(lines[: - ignore])
    #                     file_counts[split].append('.'.join(p_filename.split('/')[-1].split('.')[:-1]) + ".txt")
    #                     lines = ""
    #                     ignore = 0
    #                     save = False
    #                 else:
    #                     lines += line
    #             elif line[0] == '#':
    #                 ignore += len(line)
    #                 lines += line
    #             elif len(line) < 2:
    #                 if not save:
    #                     lines = ""
    #                     save = True
    #                 lines += line
    #             else:
    #                 if save:
    #                     lines += line
    #                     ignore = 0
    #     check_file_count(task, file_counts)


def main():
    """Parse the command line and run the requested pack or unpack operation."""
    argparser = ArgumentParser(description=__doc__)
    # '-mode' is kept for backward compatibility; '--mode' matches the
    # spelling of every other option.
    argparser.add_argument('-mode', '--mode', dest='mode',
                           choices=['pack', 'unpack'], default='pack',
                           help='Script mode: pack multiple files into one, '
                                'or unpack one file to multiple ones')
    argparser.add_argument('--path', default="../",
                           help="Path to EPE directory. Default value assumes "
                                "running in EPE's bin directory")
    argparser.add_argument('--task', choices=['events', 'opinion', 'negation'],
                           default=None,
                           help='Specifies the task for which to run the '
                                'script. If no value is passed, the files of '
                                'all the tasks are packed. Required in unpack '
                                'mode')
    argparser.add_argument('--outpath', default=None,
                           help='Path to unpack files. Required in unpack mode')
    argparser.add_argument('--infile', default=None,
                           help='Full path to file to unpack. Required in '
                                'unpack mode')
    args = argparser.parse_args(sys.argv[1:])

    if args.mode == 'pack':
        tasks = [args.task] if args.task else ['events', 'opinion', 'negation']
        for task in tasks:
            pack(args.path, task)
        return

    # Unpack mode: all three arguments are mandatory and must be non-empty.
    if not (args.infile and args.task and args.outpath):
        argparser.error('''unpack mode requires --infile, --outpath and --task. See pack.py -h for more''')
    unpack(args.infile, args.task, args.outpath)


if __name__ == '__main__':
    main()