/*
 PredicateFeatureExtractor.java

 Created on Nov 15, 2006 by Richard Johansson (richard@cs.lth.se).

 $Log: ExprFeatureExtractor.java,v $
 Revision 1.2  2011-05-25 13:01:22  johansson
 For CL article / CIKM.

 Revision 1.1  2010-06-07 11:33:30  johansson
 Added.

 Revision 1.1  2009/01/16 14:50:55  johansson
 Added to brenta repository.

 Revision 1.5  2008/05/16 11:49:50  richard
 Separate pred pos.

 Revision 1.4  2008/04/17 14:51:52  richard
 Replaces numbers.

 Revision 1.3  2008/04/10 07:12:30  richard
 New features.

 Revision 1.2  2008/04/07 08:02:36  richard
 Support.

 Revision 1.1  2008/03/18 15:18:48  richard
 Imported from fnlabeler.

 Revision 1.8  2007/10/12 06:35:06  richard
 More features.

 Revision 1.7  2007/10/08 12:33:02  richard
 Refactored again to remove atoms.

 Revision 1.6  2007/10/04 12:28:02  richard
 Refactored to atom-based.

 Revision 1.5  2007/08/21 10:21:06  richard
 Subject in depset.

 Revision 1.4  2007/08/20 15:03:41  richard
 Added phrases.

 Revision 1.3  2007/08/20 12:05:15  richard
 Restructured task configuration. Gold standard steps.

 Revision 1.2  2007/08/17 10:54:22  richard
 Cleaned up. Loads feature declarations.

 Revision 1.1  2007/08/17 10:15:10  richard
 Refactored.

 Revision 1.1  2007/04/05 13:26:42  richard
 Version used in SemEval.

 Revision 1.2  2006/11/20 16:19:13  richard
 Labels a treebank.

 Revision 1.1  2006/11/15 14:21:46  richard
 Added the file.

 */
package mpqa_structlearn.locallink;

import java.util.*;

import se.lth.cs.nlp.nlputils.ml_long.*;
import se.lth.cs.nlp.nlputils.core.*;
import se.lth.cs.nlp.nlputils.depgraph.*;

import mpqa_structlearn.io.*;

/**
 * Feature extractor for a single MPQA expression in its sentence. The
 * features describe the expression's head node, that node's first parent,
 * and the types of the expressions that dominate it in the dependency graph.
 *
 * @author Richard Johansson (richard@cs.lth.se)
 */
public class ExprFeatureExtractor extends FeatureExtractor<Pair<MPQAExpression, MPQASentence>> {

    private static final long serialVersionUID = 0L;

    /* Feature indices, looked up by name in the constructor. */
    int P_EXPR_HEAD;
    int P_EXPR_HEAD_LEMMA;
    int P_EXPR_HEAD_POS;
    int P_VOICE;
    int P_DEP_SUBCAT;
    int P_DEPS;
    int P_EXPRTYPE;
    int P_PARENT_WORD;
    int P_PARENT_POS;
    int P_REL_TO_PARENT;
    int P_DOMINATING_EXPR;

    //int P_EXPR_WORDS; // 110427

    private EnglishFeatures lang;

    /*
     * Scratch set handed to lang.findDeps on every call and then passed on to
     * the feature list.
     * NOTE(review): it is never cleared in this class -- confirm that findDeps
     * resets it and that the feature list does not keep a reference to it.
     * NOTE(review): element type assumed to be String -- confirm against
     * EnglishFeatures.findDeps.
     */
    private HashSet<String> deps = new HashSet<String>();

    /*
     * Placeholder used for the parent word and POS when the parent is the
     * node at position 0.
     * NOTE(review): the literal is empty here; confirm it is not the remnant
     * of an angle-bracketed token that was lost in extraction.
     */
    private static final String ROOT = "";
    private static final String TRUE = "true";
    private static final String FALSE = "false";

    /* When true, normalize() maps every ASCII digit to '0'. */
    private static final boolean REPLACE_NUMBERS = true;

    /**
     * Constructs an ExprFeatureExtractor and resolves the index of every
     * feature it sets.
     *
     * @param lang the language-specific helper used to find the head node,
     *             voice, subcategorization frame and dependents.
     * @param decl the feature declaration from which the feature indices
     *             are looked up by name.
     */
    public ExprFeatureExtractor(EnglishFeatures lang,
                                FeatureDeclaration decl) {
        this.lang = lang;

        P_EXPR_HEAD = decl.getFeatureIndex("P_EXPR_HEAD");
        P_EXPR_HEAD_LEMMA = decl.getFeatureIndex("P_EXPR_HEAD_LEMMA");
        P_EXPR_HEAD_POS = decl.getFeatureIndex("P_EXPR_HEAD_POS");
        P_VOICE = decl.getFeatureIndex("P_VOICE");
        P_DEP_SUBCAT = decl.getFeatureIndex("P_DEP_SUBCAT");
        P_DEPS = decl.getFeatureIndex("P_DEPS");
        P_EXPRTYPE = decl.getFeatureIndex("P_EXPRTYPE");
        P_PARENT_WORD = decl.getFeatureIndex("P_PARENT_WORD");
        P_PARENT_POS = decl.getFeatureIndex("P_PARENT_POS");
        P_REL_TO_PARENT = decl.getFeatureIndex("P_REL_TO_PARENT");
        P_DOMINATING_EXPR = decl.getFeatureIndex("P_DOMINATING_EXPR");
        //P_EXPR_WORDS = decl.getFeatureIndex("P_EXPR_WORDS");
    }

    /**
     * Extracts the features of an expression into a feature list.
     *
     * @param p           the expression (left) and the sentence it belongs
     *                    to (right).
     * @param featureList the feature list that receives the values.
     */
    public void extractFeatures(Pair<MPQAExpression, MPQASentence> p,
                                FeatureList featureList) {
        MPQAExpression expr = p.left;
        MPQASentence sen = p.right;

        // The node that lang.getHighest returns for the span is used as the
        // expression's head for all head-related features.
        DepNode highest = lang.getHighest(expr.span);

        featureList.setFeature(P_EXPR_HEAD, normalize(highest.word));
        featureList.setFeature(P_EXPR_HEAD_LEMMA, normalize(highest.lemma));
        featureList.setFeature(P_EXPR_HEAD_POS, highest.pos);
        featureList.setFeature(P_VOICE, lang.findVoice(highest));
        featureList.setFeature(P_DEP_SUBCAT, lang.findDepSubCat(highest));
        featureList.setFeature(P_EXPRTYPE, expr.type);

        lang.findDeps(highest, true, deps);
        featureList.setFeature(P_DEPS, deps);

        // Only the first parent and the first relation are considered.
        DepNode parent = highest.parents[0];

        // TODO should we handle coordination in some way?
        featureList.setFeature(P_REL_TO_PARENT, highest.relations[0]);

        if (parent.position == 0) {
            // The parent is the node at position 0: use the ROOT placeholder.
            featureList.setFeature(P_PARENT_WORD, ROOT);
            featureList.setFeature(P_PARENT_POS, ROOT);
        } else {
            featureList.setFeature(P_PARENT_WORD, normalize(parent.word));
            featureList.setFeature(P_PARENT_POS, parent.pos);
        }

        featureList.setFeature(P_DOMINATING_EXPR,
                getDominatingExpressions(expr, sen));

        /*if(P_EXPR_WORDS >= 0) {
            HashSet s = new HashSet();
            for(DepNode n: expr.span) {
                s.add(normalize(n.lemma));
            }
            featureList.setFeature(P_EXPR_WORDS, s);
        }*/
    }

    /**
     * Normalizes a word form for use as a feature value: lower-cases it and,
     * when REPLACE_NUMBERS is set, replaces every ASCII digit with '0'.
     *
     * @param word the word to normalize; may be null.
     * @return the normalized word, or the empty string if word is null.
     */
    private String normalize(String word) {
        if (word == null)
            return "";
        // Locale.ROOT keeps the result independent of the platform's default
        // locale, so the same text always yields the same feature string.
        word = word.toLowerCase(java.util.Locale.ROOT);
        if (REPLACE_NUMBERS)
            word = word.replaceAll("[0-9]", "0");
        return word;
    }

    /**
     * Collects the types of the expressions that dominate an expression.
     *
     * For every node in the expression's span, the first-parent chain is
     * followed up to the node at position 0. The dominator is the first
     * ancestor found after the last ancestor that still belongs to the
     * expression itself. From the dominator upwards (excluding the node at
     * position 0), the type of every expression whose span contains the
     * visited node is added to the result.
     *
     * @param expr the expression whose dominating expressions are wanted.
     * @param sen  the sentence holding all expressions to consider.
     * @return the set of types of the dominating expressions.
     * @throws RuntimeException if no dominator is found for some node of the
     *         span, e.g. when that node is itself at position 0.
     */
    private HashSet<String> getDominatingExpressions(MPQAExpression expr,
                                                     MPQASentence sen) {
        /* TODO cache this map. */
        // Maps every node to the expressions whose span contains it.
        SetMap<DepNode, MPQAExpression> m = new SetMap<DepNode, MPQAExpression>();
        for (MPQAExpression e2 : sen.exprs)
            for (DepNode n : e2.span)
                m.put(n, e2);

        // NOTE(review): element type assumed to be String (the type of
        // MPQAExpression.type) -- confirm.
        HashSet<String> out = new HashSet<String>();

        for (DepNode start : expr.span) {
            DepNode n = start;
            DepNode d = null;
            while (n.position > 0) {
                n = n.parents[0];
                HashSet<MPQAExpression> s = m.get(n);
                // The set holds expressions, so membership has to be tested
                // against expr; testing the node itself could never match and
                // left this reset branch dead.
                if (s != null && s.contains(expr))
                    d = null; // still inside expr: forget any earlier candidate
                else if (d == null)
                    d = n;    // first ancestor outside expr
            }
            if (d == null)
                throw new RuntimeException("dominator == null");

            while (d.position > 0) {
                HashSet<MPQAExpression> s = m.get(d);
                if (s != null)
                    for (MPQAExpression e : s)
                        out.add(e.type);
                d = d.parents[0];
            }
        }

        //System.out.println("out = " + out);

        return out;
    }

}