edu.isi.mavuno.app.ie.ExtractRelations.java Source code

Introduction

Here is the source code for edu.isi.mavuno.app.ie.ExtractRelations.java, part of Mavuno, a Hadoop-based text mining toolkit. The class is a MapReduce tool that matches extraction patterns against sentence-segmented, parsed documents and writes the resulting relations and individuals as one ELF-format XML file per document.

Source

/*
 * Mavuno: A Hadoop-Based Text Mining Toolkit
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0 
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package edu.isi.mavuno.app.ie;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import org.w3c.dom.Document;
import org.w3c.dom.Element;

import edu.isi.mavuno.extract.Extractor;
import edu.isi.mavuno.input.SentenceSegmentedDocument;
import edu.isi.mavuno.util.ContextPatternWritable;
import edu.isi.mavuno.util.IdWeightPair;
import edu.isi.mavuno.util.Individual;
import edu.isi.mavuno.util.MavunoUtils;
import edu.isi.mavuno.util.Relation;
import edu.isi.mavuno.util.SentenceWritable;
import edu.isi.mavuno.util.TokenFactory;
import edu.isi.mavuno.util.TratzParsedTokenWritable;
import edu.isi.mavuno.util.TypedTextSpan;
import edu.stanford.nlp.util.IntPair;
import edu.stanford.nlp.util.Pair;

/**
 * @author metzler
 *
 */
public class ExtractRelations extends Configured implements Tool {
    private static final Logger sLogger = Logger.getLogger(ExtractRelations.class);

    private static final TokenFactory<TratzParsedTokenWritable> TOKEN_FACTORY = new TratzParsedTokenWritable.ParsedTokenFactory();

    public ExtractRelations(Configuration conf) {
        super(conf);
    }

    private static class MyMapper
            extends Mapper<Writable, SentenceSegmentedDocument<TratzParsedTokenWritable>, Text, Text> {

        private static final Text MAVUNO_SOURCE = new Text("isi:mavuno");

        private static final Text DEFAULT_TYPE = new Text("ANY");
        private static final Text O = new Text("O");

        private static final byte[] BUFFER = new byte[1024 * 1024];

        private String mPlaintextPath = null;

        private Extractor mExtractor = null;

        private final Map<Text, Text> mRelationNameLookup = new HashMap<Text, Text>();

        private final Map<Text, Text[]> mArgNames = new HashMap<Text, Text[]>();
        private final Map<Text, Text[]> mArgTypes = new HashMap<Text, Text[]>();
        private final Map<Text, Text[]> mArgClasses = new HashMap<Text, Text[]>();

        private final Map<Text, Text> mPrimaryClasses = new HashMap<Text, Text>();
        private final Map<Text, List<IdWeightPair>> mPatterns = new HashMap<Text, List<IdWeightPair>>();
        private final Map<Text, List<IdWeightPair>> mInstances = new HashMap<Text, List<IdWeightPair>>();

        // maps relations to their confidence scores
        private final Map<Relation, Double> mRelations = new HashMap<Relation, Double>();

        // maps individual ids to their representations
        private final Map<Text, Individual> mIndividuals = new HashMap<Text, Individual>();

        private final SentenceSegmentedDocument<TratzParsedTokenWritable> mDoc = new SentenceSegmentedDocument<TratzParsedTokenWritable>(
                TOKEN_FACTORY);

        private final Text mKey = new Text();
        private final Text mValue = new Text();

        private final ContextPatternWritable mPair = new ContextPatternWritable();

        //      private final Map<IntPair, Integer> mCorefForwardLookup = new HashMap<IntPair, Integer>();
        //      private final Map<Integer, List<IntPair>> mCorefReverseLookup = new HashMap<Integer, List<IntPair>>();

        private void loadTypes(String typesPath, Configuration conf) throws IOException {
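            // The types file is tab-separated, one relation per line:
            //   <relationId> \t <relationName> \t (<argName> \t <argType> \t <argClass>)+
            // A hypothetical example line (tabs shown as double spaces):
            //   r1  employedBy  employee  per:person  PERSON  employer  org:org  ORGANIZATION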
            // reset relation name lookup
            mRelationNameLookup.clear();

            // clear argument names
            mArgNames.clear();

            // clear argument types
            mArgTypes.clear();

            // clear argument classes
            mArgClasses.clear();

            BufferedReader reader = MavunoUtils.getBufferedReader(conf, typesPath);

            // read types
            String input;
            while ((input = reader.readLine()) != null) {
                String[] cols = input.split("\t");

                if (cols.length < 5 || (cols.length - 2) % 3 != 0) {
                    throw new RuntimeException("Ill-formed line in types file -- " + input);
                }

                Text relationId = new Text(cols[0]);
                Text relationName = new Text(cols[1]);

                mRelationNameLookup.put(relationId, relationName);

                Text[] argNames = new Text[(cols.length - 2) / 3];
                Text[] argTypes = new Text[(cols.length - 2) / 3];
                Text[] argClasses = new Text[(cols.length - 2) / 3];

                for (int i = 2; i < cols.length; i += 3) {
                    argNames[(i - 2) / 3] = new Text(cols[i]);
                    argTypes[(i - 2) / 3] = new Text(cols[i + 1]);
                    argClasses[(i - 2) / 3] = new Text(cols[i + 2]);
                }

                mArgNames.put(relationId, argNames);
                mArgTypes.put(relationId, argTypes);
                mArgClasses.put(relationId, argClasses);
            }

            // close current reader
            reader.close();
        }

        private static void loadPatterns(Map<Text, List<IdWeightPair>> patterns, String patternsPath,
                Configuration conf) throws IOException {
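            // Pattern and instance files are tab-separated; the weight column is
            // optional and defaults to 1.0:
            //   <relationId> \t <pattern> [\t <weight>]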
            // clear example lookup
            patterns.clear();

            BufferedReader reader = MavunoUtils.getBufferedReader(conf, patternsPath);

            // read patterns
            String input;
            while ((input = reader.readLine()) != null) {
                String[] cols = input.split("\t");

                if (cols.length < 2 || cols.length > 3) {
                    throw new RuntimeException("Ill-formed line in pattern file -- " + input);
                }

                Text relationName = new Text(cols[0]);
                Text pattern = new Text(cols[1]);

                float weight = 1.0f;
                if (cols.length == 3) {
                    weight = Float.parseFloat(cols[2]);
                }

                IdWeightPair pair = new IdWeightPair(relationName, weight);

                updatePatternMap(patterns, pattern, pair);
            }

            // close current reader
            reader.close();
        }

        private static void updatePatternMap(Map<Text, List<IdWeightPair>> patterns, Text pattern,
                IdWeightPair pair) {
            // populate pattern lookup
            List<IdWeightPair> contextList = patterns.get(pattern);
            if (contextList == null) {
                contextList = new ArrayList<IdWeightPair>(1);
                contextList.add(pair);
                patterns.put(pattern, contextList);
            } else {
                contextList.add(pair);
            }
        }

        @Override
        public void setup(
                Mapper<Writable, SentenceSegmentedDocument<TratzParsedTokenWritable>, Text, Text>.Context context)
                throws IOException {
            Configuration conf = context.getConfiguration();

            try {
                // initialize extractor
                mExtractor = (Extractor) Class.forName(conf.get("Mavuno.ExtractRelations.ExtractorClass"))
                        .newInstance();
                String contextArgs = conf.get("Mavuno.ExtractRelations.ExtractorArgs", null);
                mExtractor.initialize(contextArgs, conf);

                // load types
                String typesPath = conf.get("Mavuno.ExtractRelations.TypesPath", null);
                loadTypes(typesPath, conf);

                // get primary types
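                // PrimaryTypes is a comma-separated list of class|type pairs,
                // e.g. (hypothetical): "PERSON|per:person,ORGANIZATION|org:organization"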
                String[] primaryTypes = conf.get("Mavuno.ExtractRelations.PrimaryTypes", "").split(",");
                mPrimaryClasses.clear();
                for (int i = 0; i < primaryTypes.length; i++) {
                    String[] pair = primaryTypes[i].split("\\|");
                    if (pair.length != 2) {
                        throw new RuntimeException("Illegal primary type specification -- " + primaryTypes[i]);
                    }
                    mPrimaryClasses.put(new Text(pair[0]), new Text(pair[1]));
                }

                // load extraction patterns
                String patternsPath = conf.get("Mavuno.ExtractRelations.PatternsPath", null);
                loadPatterns(mPatterns, patternsPath, conf);

                // load instances (if provided)
                String instancesPath = conf.get("Mavuno.ExtractRelations.InstancesPath", null);
                if (instancesPath != null) {
                    loadPatterns(mInstances, instancesPath, conf);
                }

                // set plaintext corpus path location (if set)
                mPlaintextPath = conf.get("Mavuno.ExtractRelations.PlaintextPath", null);
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }

        @Override
        public void map(Writable key, SentenceSegmentedDocument<TratzParsedTokenWritable> doc,
                Mapper<Writable, SentenceSegmentedDocument<TratzParsedTokenWritable>, Text, Text>.Context context)
                throws IOException, InterruptedException {
            // key = document id
            mKey.set(doc.getDocid());

            sLogger.info("Processing document: " + doc.getDocid());

            // get sentences
            List<SentenceWritable<TratzParsedTokenWritable>> sentences = doc.getSentences();

            int sentId = 0;
            //int tokenId = 0;

            //         // get coref clusters
            //         mCorefForwardLookup.clear();
            //         mCorefReverseLookup.clear();
            //
            //         for(SentenceWritable<TratzParsedTokenWritable> s : sentences) {
            //            // reset token id
            //            tokenId = 0;
            //
            //            for(TratzParsedTokenWritable t : s.getTokens()) {
            //               int id = t.getCorefId();
            //               if(id != -1) {
            //                  // position within the document
            //                  IntPair pos = new IntPair(sentId, tokenId);
            //
            //                  // forward lookup
            //                  mCorefForwardLookup.put(pos, id);
            //
            //                  // reverse lookup
            //                  List<IntPair> pairs = mCorefReverseLookup.get(id);
            //                  if(pairs == null) {
            //                     pairs = new ArrayList<IntPair>();
            //                     pairs.add(pos);
            //                     mCorefReverseLookup.put(id, pairs);
            //                  }
            //                  else {
            //                     pairs.add(pos);
            //                  }
            //               }
            //               tokenId++;
            //            }
            //            sentId++;
            //         }

            // clear relations
            mRelations.clear();

            // clear individuals
            mIndividuals.clear();

            // extract separately from each sentence
            sentId = 0;
            //tokenId = 0;
            for (SentenceWritable<TratzParsedTokenWritable> s : sentences) {
                // construct new document that only contains this sentence
                mDoc.clear();
                mDoc.addSentence(s);

                // set current document
                mExtractor.setDocument(mDoc);

                //sLogger.info("SENTENCE = " + s);

                String sentenceText = s.toStringOfTokens();

                // skip empty sentences
                if (sentenceText.length() == 0) {
                    continue;
                }

                int sentenceCharOffsetBegin = s.getTokenAt(0).getCharOffsetBegin();
                int sentenceCharOffsetEnd = s.getTokenAt(s.getNumTokens() - 1).getCharOffsetEnd();

                List<Text> arguments = new ArrayList<Text>();

                // reset token id
                //tokenId = 0;

                // main extract loop -- extracts instances and relations
                List<IdWeightPair> contextList = null;
                while (mExtractor.getNextPair(mPair)) {
                    // get the context instances for this extraction
                    arguments.clear();
                    String[] args = mPair.getContext().toString().split("\\|");
                    for (int i = 0; i < args.length; i++) {
                        arguments.add(new Text(args[i]));
                    }

                    List<Set<Text>> allArgClasses = new ArrayList<Set<Text>>();
                    List<Integer> allArgCharOffsetBegin = new ArrayList<Integer>();
                    List<Integer> allArgCharOffsetEnd = new ArrayList<Integer>();

                    // process each argument instance
                    for (Text arg : arguments) {
                        // get offset within sentence
                        int argOffset = getOffset(arg, sentenceText);

                        // skip if we can't find an alignment for some reason
                        if (argOffset == -1) {
                            sLogger.warn("Can't find alignment for: " + arg + " in sentence: " + sentenceText);
                            continue;
                        }

                        // argument length
                        int argLength = getLength(arg);

                        // argument char offsets
                        int argCharOffsetBegin = s.getTokenAt(argOffset).getCharOffsetBegin();
                        int argCharOffsetEnd = s.getTokenAt(argOffset + argLength - 1).getCharOffsetEnd();

                        allArgCharOffsetBegin.add(argCharOffsetBegin);
                        allArgCharOffsetEnd.add(argCharOffsetEnd);

                        // get arg classes
                        Set<Text> argClasses = getTypes(sentences, s, sentId, argOffset, argLength, true);

                        allArgClasses.add(argClasses);

                        // is this a mention of a primary class? if so, then process it
                        for (Text argClass : argClasses) {
                            Text argType = mPrimaryClasses.get(argClass);
                            if (argType != null) {
                                Pair<Text, IntPair> individualSpec = resolveCoref(mPair.getContext(), argClass,
                                        sentences, s, argOffset, argLength);
                                Individual individual = mIndividuals.get(individualSpec.first);
                                if (individual == null) {
                                    individual = new Individual(individualSpec.first,
                                            individualSpec.second.getSource(), individualSpec.second.getTarget());
                                    mIndividuals.put(new Text(individualSpec.first), individual);
                                }
                                individual.addOccurrence(new TypedTextSpan(argType, individualSpec.first,
                                        individualSpec.second.getSource(), individualSpec.second.getTarget()));
                            }
                        }
                    }

                    // check if this pattern matches any of the relation patterns
                    contextList = mPatterns.get(mPair.getPattern());

                    // if this pattern doesn't match any of the relation patterns then we're done
                    if (contextList == null) {
                        continue;
                    }

                    // if found, then process
                    for (IdWeightPair pair : contextList) {
                        Text[] expectedNames = mArgNames.get(pair.id);
                        Text[] expectedTypes = mArgTypes.get(pair.id);
                        Text[] expectedClasses = mArgClasses.get(pair.id);

                        // uh oh, we're missing name and/or type information
                        if (expectedNames == null || expectedTypes == null || expectedClasses == null) {
                            throw new RuntimeException("Missing name, type, and/or class information for: " + pair);
                        }

                        // perform length count checking
                        if (expectedClasses.length != expectedNames.length
                                || expectedNames.length != expectedTypes.length
                                || expectedTypes.length != allArgClasses.size()) {
                            sLogger.warn("Argument length mismatch for: " + pair + " -- skipping!");
                            continue;
                        }

                        // perform class type checking
                        boolean matches = true;
                        for (int i = 0; i < expectedClasses.length; i++) {
                            if (!allArgClasses.get(i).contains(expectedClasses[i])) {
                                matches = false;
                                break;
                            }
                        }

                        if (matches) {
                            // build relation
                            Relation r = new Relation(mRelationNameLookup.get(pair.id), MAVUNO_SOURCE,
                                    sentenceCharOffsetBegin, sentenceCharOffsetEnd);
                            for (int i = 0; i < arguments.size(); i++) {
                                // argument text
                                Text argText = arguments.get(i);

                                // argument name
                                Text argName = expectedNames[i];

                                // argument type
                                Text argType = expectedTypes[i];

                                // beginning and end offset for this argument
                                int argBegin = allArgCharOffsetBegin.get(i);
                                int argEnd = allArgCharOffsetEnd.get(i);

                                // find the individual for this argument
                                Individual individual = mIndividuals.get(argText);
                                if (individual == null) { // create new individual
                                    individual = new Individual(argText, argBegin, argEnd);
                                    individual.addOccurrence(new TypedTextSpan(argType, argText, argBegin, argEnd));
                                    mIndividuals.put(new Text(argText), individual);
                                } else {
                                    individual.addOccurrence(new TypedTextSpan(argType, argText, argBegin, argEnd));
                                }

                                // add argument to relation
                                r.addArgument(argName, argText, individual, argBegin, argEnd);
                            }

                            Double confidence = mRelations.get(r);
                            if (confidence == null) {
                                mRelations.put(r, (double) pair.weight);
                            } else {
                                mRelations.put(r, confidence + pair.weight);
                            }
                        }
                    }

                    //tokenId++;
                }

                sentId++;
            }

            try {
                // read plain text version of document, if necessary
                String documentText = null;
                if (mPlaintextPath != null) {
                    documentText = loadDocumentText(context.getConfiguration(), mPlaintextPath, doc.getDocid());
                }

                // generate XML output
                String xml = getXMLOutput(doc.getDocid(), documentText, mRelations, mIndividuals);
                //System.out.println(xml);
                mValue.set(xml);
                context.write(mKey, mValue);
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }

        private static String loadDocumentText(Configuration conf, String path, String docid) throws IOException {
            Text text = new Text();

            FSDataInputStream reader = MavunoUtils.getFSDataInputStream(conf, path + "/" + docid);

            int n;
            while ((n = reader.read(BUFFER, 0, BUFFER.length)) != -1) {
                text.append(BUFFER, 0, n);
            }

            reader.close();

            return text.toString();
        }

        // creates an XML representation of the relations and individuals
        private static String getXMLOutput(String docid, String docText, Map<Relation, Double> relations,
                Map<Text, Individual> individuals) throws ParserConfigurationException, TransformerException {
            DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance();
            DocumentBuilder domBuilder = domFactory.newDocumentBuilder();

            Document doc = domBuilder.newDocument();
            Element rootElement = doc.createElement("doc");
            rootElement.setAttribute("xmlns", "http://www.bbn.com/MR/ELF");
            rootElement.setAttribute("id", docid);
            rootElement.setAttribute("elf-version", "2.2");
            rootElement.setAttribute("source", "Mavuno Reader");
            rootElement.setAttribute("contents", "S-ELF");
            doc.appendChild(rootElement);

            Map<Individual, Integer> individualIds = new HashMap<Individual, Integer>();

            int id = 1;
            for (Map.Entry<Text, Individual> indEntry : individuals.entrySet()) {
                Individual ind = indEntry.getValue();
                TypedTextSpan indSpan = ind.getSpan();

                Element indElement = doc.createElement("individual");
                indElement.setAttribute("id", Integer.toString(id));

                Element nameElement = doc.createElement("name");
                if (docText != null) {
                    nameElement.setTextContent(docText.substring(indSpan.start, indSpan.end + 1));
                }
                nameElement.setAttribute("name", indSpan.text.toString());
                nameElement.setAttribute("start", Integer.toString(indSpan.start));
                nameElement.setAttribute("end", Integer.toString(indSpan.end));
                indElement.appendChild(nameElement);

                for (TypedTextSpan occurrence : ind.getOccurrences()) {
                    // handle special case
                    // TODO: make this more modular
                    if (occurrence.type.toString().equals("xsd:string")) {
                        continue;
                    }

                    Element occurrenceElement = doc.createElement("type");
                    if (docText != null) {
                        occurrenceElement.setTextContent(docText.substring(occurrence.start, occurrence.end + 1));
                    }
                    occurrenceElement.setAttribute("type", occurrence.type.toString());
                    occurrenceElement.setAttribute("start", Integer.toString(occurrence.start));
                    occurrenceElement.setAttribute("end", Integer.toString(occurrence.end));
                    indElement.appendChild(occurrenceElement);
                }

                if (indElement.getChildNodes().getLength() > 1) {
                    rootElement.appendChild(indElement);
                    individualIds.put(ind, id);
                    id++;
                }
            }

            for (Map.Entry<Relation, Double> relEntry : relations.entrySet()) {
                Relation rel = relEntry.getKey();
                double confidence = relEntry.getValue();

                // TODO: fix this
                if (confidence > 1.0) {
                    confidence = 1.0;
                }

                Element relationElement = doc.createElement("relation");
                if (docText != null) {
                    Element textElement = doc.createElement("text");
                    textElement.setTextContent(docText.substring(rel.getStartOffset(), rel.getEndOffset() + 1));
                    relationElement.appendChild(textElement);
                }
                relationElement.setAttribute("name", rel.getName().toString());
                relationElement.setAttribute("source", rel.getSource().toString());
                relationElement.setAttribute("start", Integer.toString(rel.getStartOffset()));
                relationElement.setAttribute("end", Integer.toString(rel.getEndOffset()));
                relationElement.setAttribute("p", Double.toString(confidence));

                for (Map.Entry<TypedTextSpan, Individual> argEntry : rel.getArguments().entrySet()) {
                    TypedTextSpan argSpan = argEntry.getKey();
                    Individual argInd = argEntry.getValue();

                    Element argumentElement = doc.createElement("arg");
                    if (docText != null) {
                        argumentElement.setTextContent(docText.substring(argSpan.start, argSpan.end + 1));
                    }
                    argumentElement.setAttribute("role", argSpan.type.toString());
                    argumentElement.setAttribute("start", Integer.toString(argSpan.start));
                    argumentElement.setAttribute("end", Integer.toString(argSpan.end));

                    // handle special case
                    // TODO: make this more modular
                    if (argSpan.type.toString().startsWith("t:")) {
                        argumentElement.setAttribute("type", "xsd:string");
                        argumentElement.setAttribute("value", argSpan.text.toString());
                    } else {
                        int argId = individualIds.get(argInd);
                        argumentElement.setAttribute("id", Integer.toString(argId));
                    }

                    relationElement.appendChild(argumentElement);
                }

                rootElement.appendChild(relationElement);
            }

            TransformerFactory transFactory = TransformerFactory.newInstance();
            Transformer trans = transFactory.newTransformer();
            trans.setOutputProperty(OutputKeys.INDENT, "yes");
            trans.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2");

            StringWriter sw = new StringWriter();
            StreamResult result = new StreamResult(sw);
            DOMSource source = new DOMSource(doc);
            trans.transform(source, result);

            return sw.toString();
        }
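
        // The XML produced above has roughly the following shape (attribute
        // values are illustrative only):
        //
        //   <doc xmlns="http://www.bbn.com/MR/ELF" id="..." elf-version="2.2"
        //        source="Mavuno Reader" contents="S-ELF">
        //     <individual id="1">
        //       <name name="..." start="..." end="...">...</name>
        //       <type type="..." start="..." end="...">...</type>
        //     </individual>
        //     <relation name="..." source="isi:mavuno" start="..." end="..." p="...">
        //       <arg role="..." start="..." end="..." id="...">...</arg>
        //     </relation>
        //   </doc>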

        private Pair<Text, IntPair> resolveCoref(Text arg, Text expectedType,
                List<SentenceWritable<TratzParsedTokenWritable>> sentences,
                SentenceWritable<TratzParsedTokenWritable> s, int offset, int length) {
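            // with the coref machinery commented out below, this simply expands the
            // argument to its covering chunk and returns the chunk text along with
            // its character offsets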
            Pair<Text, IntPair> bestPair = null;
            //         // resolve co-ref to best individual
            //         for(int i = offset; i < offset + length; i++) {
            //            int corefId = s.getTokenAt(i).getCorefId();
            //            List<IntPair> pairs = mCorefReverseLookup.get(corefId);
            //            if(pairs != null) {
            //               for(IntPair p : pairs) {
            //                  IntPair chunkSpan = getChunkSpan(sentences.get(p.getSource()), p.getTarget());
            //                  Set<Text> chunkTypes = getTypes(sentences, sentences.get(p.getSource()), p.getSource(), chunkSpan.getSource(), chunkSpan.getTarget(), false);
            //                  if(chunkTypes.contains(expectedType)) {
            //                     Text chunkText = getSpan(sentences.get(p.getSource()), chunkSpan.getSource(), chunkSpan.getTarget());
            //                     int begin = sentences.get(p.getSource()).getTokenAt(chunkSpan.getSource()).getCharOffsetBegin();
            //                     int end = sentences.get(p.getSource()).getTokenAt(chunkSpan.getSource()+chunkSpan.getTarget()-1).getCharOffsetEnd();
            //                     //System.out.println(arg + " RESOLVES TO " + chunkText + "\t" + begin + "\t" + end);
            //                     if(bestPair == null || chunkText.getLength() > bestPair.first.getLength()) {
            //                        bestPair = new Pair<Text,IntPair>(chunkText, new IntPair(begin, end));
            //                     }
            //                  }
            //               }
            //            }
            //         }
            //
            if (bestPair == null) {
                IntPair chunkSpan = getChunkSpan(s, offset);
                Text chunkText = getSpan(s, chunkSpan.getSource(), chunkSpan.getTarget());
                int begin = s.getTokenAt(chunkSpan.getSource()).getCharOffsetBegin();
                int end = s.getTokenAt(chunkSpan.getSource() + chunkSpan.getTarget() - 1).getCharOffsetEnd();
                bestPair = new Pair<Text, IntPair>(chunkText, new IntPair(begin, end));
            }

            return bestPair;
        }

        private Text getSpan(SentenceWritable<TratzParsedTokenWritable> tokens, int offset, int length) {
            Text span = new Text();

            for (int i = offset; i < offset + length; i++) {
                Text t = tokens.getTokenAt(i).getToken();
                span.append(t.getBytes(), 0, t.getLength());
                if (i != offset + length - 1) {
                    span.append(MavunoUtils.SPACE_BYTES, 0, MavunoUtils.SPACE_BYTES_LENGTH);
                }
            }

            return span;
        }

        private IntPair getChunkSpan(SentenceWritable<TratzParsedTokenWritable> tokens, int offset) {
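            // returns (start token position, token length) of the chunk containing
            // the token at the given offset, taking care not to split a multi-token
            // named entity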
            int beginPos = 0;

            Text curNETag = null;
            for (int i = 0; i < tokens.getNumTokens(); i++) {
                Text neTag = tokens.getTokenAt(i).getNETag();
                Text chunkTag = tokens.getTokenAt(i).getChunkTag();

                // don't split named entities
                if (curNETag == null || (!curNETag.equals(O) && !neTag.equals(curNETag)) || (curNETag.equals(O)
                        && (chunkTag.getBytes()[0] == 'B' || chunkTag.getBytes()[0] == 'O'))) {
                    if (i > offset) {
                        return new IntPair(beginPos, i - beginPos);
                    }
                    curNETag = new Text(neTag);
                    beginPos = i;
                }
            }

            return new IntPair(beginPos, tokens.getNumTokens() - beginPos);
        }

        // TODO: there has to be a better way...
        private int getOffset(Text text, String sentence) {
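            // returns the token offset of the first occurrence of text within the
            // space-delimited sentence (counted by spaces), or -1 if no alignment
            // is found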
            String paddedText = " " + text + " ";

            int offset = 0;
            for (int i = 0; i < sentence.length() - text.getLength(); i++) {
                if (sentence.charAt(i) == ' ') {
                    offset++;
                }

                if (i == 0) {
                    paddedText = text + " ";
                } else if (i == 1) {
                    paddedText = " " + text + " ";
                } else if (i == sentence.length() - text.getLength() - 1) {
                    paddedText = " " + text;
                }

                if (sentence.regionMatches(i, paddedText, 0, paddedText.length())) {
                    return offset;
                }
            }
            return -1;
        }

        private int getLength(Text text) {
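            // token length of a space-delimited argument, e.g. "New York City" -> 3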
            int length = 1;
            byte[] b = text.getBytes();
            for (int i = 0; i < text.getLength(); i++) {
                if (b[i] == ' ') {
                    length++;
                }
            }
            return length;
        }

        private Set<Text> getTypes(List<SentenceWritable<TratzParsedTokenWritable>> allSentences,
                SentenceWritable<TratzParsedTokenWritable> curSentence, int sentId, int offset, int length,
                boolean useCoref) {
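            // collects the candidate types for a token span: the wildcard ANY type,
            // any user-provided instance classes, and the NE tags of the covered
            // tokens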
            Set<Text> types = new HashSet<Text>();
            types.add(DEFAULT_TYPE);

            // get user-specified types (if specified)
            Set<IdWeightPair> instanceClasses = getMatchingInstances(curSentence, offset, length);
            if (instanceClasses != null) {
                for (IdWeightPair instanceClass : instanceClasses) {
                    //System.out.println("Instance is of type: " + instanceClass.id);
                    types.add(instanceClass.id);
                }
            }

            // get NE types
            for (int i = offset; i < offset + length; i++) {
                Text type = curSentence.getTokenAt(i).getNETag();
                //System.out.println(curSentence.getTokenAt(i).getToken() + " is of type: " + type);
                types.add(type);
            }

            //         // get types (via coref)
            //         if(useCoref) {
            //            for(int i = offset; i < offset + length; i++) {
            //               IntPair pair = new IntPair(sentId, i);
            //               Integer corefId = mCorefForwardLookup.get(pair);
            //               if(corefId != null) {
            //                  List<IntPair> pairs = mCorefReverseLookup.get(corefId);
            //                  for(IntPair p : pairs) {
            //                     Text type = allSentences.get(p.getSource()).getTokenAt(p.getTarget()).getNETag();
            //                     //System.out.println(curSentence.getTokenAt(i).getToken() + " is of type: " + type + " [via coref!]");
            //                     types.add(type);
            //
            //                     instanceClasses = getMatchingInstances(allSentences.get(p.getSource()), p.getTarget(), 1);
            //                     for(IdWeightPair instanceClass : instanceClasses) {
            //                        //System.out.println("Instance is of type: " + instanceClass.id + " [via coref!]");
            //                        types.add(instanceClass.id);
            //                     }
            //                  }
            //               }
            //            }
            //         }

            return types;
        }

        private Set<IdWeightPair> getMatchingInstances(SentenceWritable<TratzParsedTokenWritable> sentence,
                int offset, int length) {
            if (mInstances.size() == 0) {
                return null;
            }

            // get tokens
            Text[] tokens = new Text[length];
            for (int i = offset; i < offset + length; i++) {
                tokens[i - offset] = new Text(sentence.getTokenAt(i).getToken().toString());
            }

            Set<IdWeightPair> matches = new HashSet<IdWeightPair>();

            Text pattern = new Text();
            for (int s = 0; s < length; s++) {
                // consider every sub-span of the argument, including the full span
                for (int l = 1; l <= length - s; l++) {

                    pattern.clear();
                    for (int pos = s; pos < s + l; pos++) {
                        pattern.append(tokens[pos].getBytes(), 0, tokens[pos].getLength());
                        if (pos != s + l - 1) {
                            pattern.append(MavunoUtils.SPACE_BYTES, 0, MavunoUtils.SPACE_BYTES_LENGTH);
                        }
                    }

                    List<IdWeightPair> list = mInstances.get(pattern);
                    if (list != null) {
                        //System.out.println("INSTANCE FOUND: " + pattern);
                        matches.addAll(list);
                    }
                }
            }

            return matches;
        }

    }

    private static class MyReducer extends Reducer<Text, Text, Text, Text> {

        @Override
        public void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context)
                throws IOException, InterruptedException {
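            // write all of the XML generated for this document id to a single file
            // named <docid>.xml under the configured output path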
            Configuration conf = context.getConfiguration();
            String outputPath = MavunoUtils.getRequiredParam("Mavuno.ExtractRelations.OutputPath", conf);
            BufferedWriter writer = MavunoUtils.getBufferedWriter(conf, outputPath + "/" + key + ".xml");
            for (Text value : values) {
                writer.write(value.toString());
            }
            writer.close();
        }
    }

    /* (non-Javadoc)
     * @see org.apache.hadoop.util.Tool#run(java.lang.String[])
     */
    @Override
    public int run(String[] args) throws ClassNotFoundException, InterruptedException, IOException {
        MavunoUtils.readParameters(args, "Mavuno.ExtractRelations", getConf());
        return run();
    }

    public int run() throws ClassNotFoundException, InterruptedException, IOException {
        Configuration conf = getConf();

        String typesPath = MavunoUtils.getRequiredParam("Mavuno.ExtractRelations.TypesPath", conf);
        String primaryTypes = MavunoUtils.getRequiredParam("Mavuno.ExtractRelations.PrimaryTypes", conf);
        String patternsPath = MavunoUtils.getRequiredParam("Mavuno.ExtractRelations.PatternsPath", conf);
        String instancesPath = MavunoUtils.getOptionalParam("Mavuno.ExtractRelations.InstancesPath", conf);
        String plaintextPath = MavunoUtils.getOptionalParam("Mavuno.ExtractRelations.PlaintextPath", conf);
        String corpusPath = MavunoUtils.getRequiredParam("Mavuno.ExtractRelations.CorpusPath", conf);
        String extractorClass = MavunoUtils.getRequiredParam("Mavuno.ExtractRelations.ExtractorClass", conf);
        String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.ExtractRelations.ExtractorArgs", conf);
        String outputPath = MavunoUtils.getRequiredParam("Mavuno.ExtractRelations.OutputPath", conf);

        sLogger.info("Tool name: ExtractRelations");
        sLogger.info(" - Types path: " + typesPath);
        sLogger.info(" - Primary types: " + primaryTypes);
        sLogger.info(" - Patterns path: " + patternsPath);
        if (instancesPath != null) {
            sLogger.info(" - Instances path: " + instancesPath);
        }
        if (plaintextPath != null) {
            sLogger.info(" - Plaintext path: " + plaintextPath);
        }
        sLogger.info(" - Corpus path: " + corpusPath);
        sLogger.info(" - Extractor class: " + extractorClass);
        sLogger.info(" - Extractor arguments: " + extractorArgs);
        sLogger.info(" - Output path: " + outputPath);

        Job job = new Job(conf);
        job.setJobName("ExtractRelations");

        FileInputFormat.addInputPath(job, new Path(corpusPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * @param args
     * @throws Exception 
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        int res = ToolRunner.run(new ExtractRelations(conf), args);
        System.exit(res);
    }
}
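
Usage

For reference, here is a minimal sketch of driving this tool programmatically. The parameter names are the ones read by ExtractRelations above; every value, including the extractor class, is a hypothetical placeholder, and the sketch assumes the Mavuno and Hadoop jars are on the classpath.

import org.apache.hadoop.conf.Configuration;

import edu.isi.mavuno.app.ie.ExtractRelations;

public class RunExtractRelations {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // required parameters (all values are placeholders)
        conf.set("Mavuno.ExtractRelations.TypesPath", "/mavuno/types.txt");
        conf.set("Mavuno.ExtractRelations.PrimaryTypes", "PERSON|per:person");
        conf.set("Mavuno.ExtractRelations.PatternsPath", "/mavuno/patterns.txt");
        conf.set("Mavuno.ExtractRelations.CorpusPath", "/mavuno/corpus");
        conf.set("Mavuno.ExtractRelations.ExtractorClass", "my.hypothetical.Extractor");
        conf.set("Mavuno.ExtractRelations.ExtractorArgs", "");
        conf.set("Mavuno.ExtractRelations.OutputPath", "/mavuno/output");

        // optional: Mavuno.ExtractRelations.InstancesPath and
        // Mavuno.ExtractRelations.PlaintextPath

        System.exit(new ExtractRelations(conf).run());
    }
}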