Java tutorial
/* * Copyright (C) 2016 Raytheon BBN Technologies Corp. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package adept.io; import java.io.*; import java.net.URISyntaxException; import java.net.URL; import java.net.URLDecoder; import java.nio.MappedByteBuffer; import java.nio.channels.FileChannel; import java.nio.charset.Charset; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import org.apache.commons.io.input.BOMInputStream; import org.json.simple.JSONArray; import org.json.simple.JSONObject; import org.json.simple.parser.JSONParser; import adept.common.Document; import adept.common.OntTypeFactory; import adept.common.TokenStream; import adept.common.Token; import adept.common.TokenOffset; import adept.common.Type; import adept.common.HltContentContainer; import adept.common.Entity; import adept.common.EntityMention; import adept.common.Event; import adept.common.EventRelations; import adept.common.Relation; import adept.common.Argument; import adept.common.Coreference; import adept.common.Chunk; import adept.common.Pair; import adept.common.PartOfSpeech; import adept.common.Passage; import adept.common.Sentence; import adept.utilities.DocumentMaker; import adept.utilities.PassageAttributes; import java.util.List; import java.io.FileReader; import java.util.LinkedList; import java.util.Enumeration; import java.util.HashSet; import java.util.ArrayList; import java.util.Set; import java.util.HashMap; import java.util.Map; import 
java.util.jar.JarEntry; import java.util.jar.JarFile; //import org.w3c.dom.Document; //conflicts with adept.common.Document import org.w3c.dom.NodeList; import org.w3c.dom.Element; /** * The Class Reader. */ public class Reader { /** * The instance. */ private static Reader instance; /** * Gets the single instance of Reader. * * @return single instance of Reader */ public static Reader getInstance() { if (instance == null) instance = new Reader(); return instance; } /** * Reads specified XML file into a DOM object. * * @param path the path * @return the document */ public org.w3c.dom.Document readXML(String path) { try { InputStream is = findStreamInClasspathOrFileSystem(path); BOMInputStream bis = new BOMInputStream(is, false); DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance(); DocumentBuilder dBuilder = dbFactory.newDocumentBuilder(); org.w3c.dom.Document doc = dBuilder.parse(bis); // TODO - might have backslashes. doc.setDocumentURI(path); return doc; } catch (Exception e) { //e.printStackTrace(); } return null; } /** * Reads a Conversation document in JSON format as specified by IHMC. 
* * @param path the path * @param utterances the utterances * @param speakers the speakers * @param title the title * @return the string */ public String readConversationFile(String path, List<String> utterances, List<String> speakers, String title) { try { JSONParser parser = new JSONParser(); Object obj = parser.parse(new FileReader(path)); JSONObject jsonObject = (JSONObject) obj; JSONObject conversation = (JSONObject) jsonObject.get("conversation"); title = (String) conversation.get("name"); //System.out.println("Title is: " + title); StringBuffer conversationText = new StringBuffer(); JSONArray utt = (JSONArray) conversation.get("utterances"); if (utt != null) { for (Object o : utt) { JSONObject utterance = (JSONObject) o; speakers.add((String) utterance.get("name")); String uttText = (String) utterance.get("utterance"); utterances.add(uttText); conversationText.append(uttText + "\n"); } } return conversationText.toString(); } catch (Exception e) { e.printStackTrace(); } return null; } /** * Reads specified ERE file into an EREDocument (Document object wrapper). * * @param path Path to ere file to read. * @param docId the doc id * @return an EREDocument with fields derived from file at path. */ public EREDocument readEREFile(String path, String docId, String language) { String filestring = readFileIntoString(path); HltContentContainer hltcc = new HltContentContainer(); EREDocument ereDoc = new EREDocument(filestring, hltcc); ereDoc.createCharMapping(); ereDoc.createDocument(docId, language, path); return ereDoc; } /** * Gets the coreferences. 
* * @param ereDoc the ere doc * @param xmlDoc the xml doc * @param emList the em list * @return the coreferences */ private List<Coreference> getCoreferences(EREDocument ereDoc, org.w3c.dom.Document xmlDoc, List<EntityMention> emList) { List<Coreference> coreferenceList = new ArrayList<Coreference>(); List<Entity> entitiesList = new ArrayList<Entity>(); List<EntityMention> resolvedEntityMentions = emList; String coreferenceIdString = xmlDoc.getElementsByTagName("deft_ere").item(0).getAttributes() .getNamedItem("docid").getNodeValue(); long coreferenceId = 0; try //this'll work for ids like 'fbis_eng_20010901.0003. If it fails, we have guid, so handle that instead { coreferenceId = Long.parseLong(coreferenceIdString.replaceAll("[^\\d]", "")); } catch (NumberFormatException e) { coreferenceId = Long.parseLong(coreferenceIdString.replaceAll("[^\\d]", "").substring(0, 10));//TODO // coreferenceId = UUID.fromString(coreferenceIdString).getLeastSignificantBits(); } Coreference cr = new Coreference(coreferenceId); NodeList entities = xmlDoc.getElementsByTagName("entity"); for (int x = 0; x < entities.getLength(); x++) { String entityIdString = entities.item(x).getAttributes().getNamedItem("id").getNodeValue(); long id = Long.parseLong(entityIdString.replaceAll("ent-", "")); EntityMention canonicalMention = ereDoc.getCanonicalEntityMentionById(id); String entityType = entities.item(x).getAttributes().getNamedItem("type").getNodeValue(); Entity e = new Entity(id, new Type(entityType)); if (canonicalMention != null) e.setCanonicalMentions(canonicalMention); else //if this entity does not figure into any relation { String emString = ((Element) entities.item(x)).getElementsByTagName("entity_mention").item(0) .getAttributes().getNamedItem("id").getNodeValue(); long emId = Long.parseLong(emString.replaceAll("m-", "")); //TODO because of issue in Discussion Forum data, added this try catch try { e.setCanonicalMentions(ereDoc.getEntityMentionById(emId)); } catch 
(NullPointerException ex) { System.out.println("can't add: " + emId); } } entitiesList.add(e); } cr.setEntities(entitiesList); cr.setResolvedMentions(resolvedEntityMentions); coreferenceList.add(cr); return coreferenceList.isEmpty() ? null : coreferenceList; } /** * Gets the coreferences. * * @param adeptDoc the ere doc * @param xmlDoc the xml doc * @param emList the em list * @return the coreferences */ private List<Coreference> getCoreferences(Document adeptDoc, org.w3c.dom.Document xmlDoc, List<EntityMention> emList) { List<Coreference> coreferenceList = new ArrayList<Coreference>(); List<Entity> entitiesList = new ArrayList<Entity>(); List<EntityMention> resolvedEntityMentions = emList; Map<Long, EntityMention> mentionsById = new HashMap<Long, EntityMention>(); for (EntityMention mention : emList) { mentionsById.put(mention.getSequenceId(), mention); } String coreferenceIdString = xmlDoc.getElementsByTagName("deft_ere").item(0).getAttributes() .getNamedItem("docid").getNodeValue(); long coreferenceId = Long.parseLong(coreferenceIdString.replaceAll("[^\\d]", "")); Coreference cr = new Coreference(coreferenceId); NodeList entities = xmlDoc.getElementsByTagName("entity"); for (int x = 0; x < entities.getLength(); x++) { String entityIdString = entities.item(x).getAttributes().getNamedItem("id").getNodeValue(); long id = Long.parseLong(entityIdString.replaceAll("ent-", "")); String entityType = entities.item(x).getAttributes().getNamedItem("type").getNodeValue(); Entity e = new Entity(id, new Type(entityType)); String emString = ((Element) entities.item(x)).getElementsByTagName("entity_mention").item(0) .getAttributes().getNamedItem("id").getNodeValue(); long emId = Long.parseLong(emString.replaceAll("m-", "")); e.setCanonicalMentions(mentionsById.get(emId)); entitiesList.add(e); } cr.setEntities(entitiesList); cr.setResolvedMentions(resolvedEntityMentions); coreferenceList.add(cr); return coreferenceList.isEmpty() ? 
null : coreferenceList; } /** * Gets the relations. * * @param ereDoc the ere doc * @param xmlDoc the xml doc * @return the relations */ private List<Relation> getRelations(EREDocument ereDoc, org.w3c.dom.Document xmlDoc) { List<Relation> relationList = new ArrayList<Relation>(); long relationId = 0; NodeList relation_mentions = xmlDoc.getElementsByTagName("relation_mention"); for (int x = 0; x < relation_mentions.getLength(); x++) { String relationType = relation_mentions.item(x).getParentNode().getAttributes().getNamedItem("type") .getNodeValue(); String relationSubtype = relation_mentions.item(x).getParentNode().getAttributes() .getNamedItem("subtype").getNodeValue(); float confidence = 1; Relation relation = new Relation(relationId++, new Type(relationType + ":" + relationSubtype)); relation.setConfidence(confidence); NodeList arguments = ((Element) relation_mentions.item(x)).getElementsByTagName("arg"); boolean redundant = false; //check for relations between mentions of same entity long prevEntityId = -1; for (int y = 0; y < arguments.getLength(); y++) { String argumentType = arguments.item(y).getAttributes().getNamedItem("type").getNodeValue(); int distributionSize = 1; Argument argument = new Argument(new Type(argumentType), distributionSize); String emIdString = arguments.item(y).getAttributes().getNamedItem("entity_mention_id") .getNodeValue(); long emId = Long.parseLong(emIdString.replaceAll("m-", "")); String entityIdString = arguments.item(y).getAttributes().getNamedItem("entity_id").getNodeValue(); long entityId = Long.parseLong(entityIdString.replaceAll("ent-", "")); if (entityId == prevEntityId) { redundant = true; break; } //sets entity mention id of this argument to that of canonical if it exists //otherwise, sets canonical mention to this mention //emId = ereDoc.putCanonicalEntityMentionById(entityId,emId).getSequenceId(); Chunk distribution = ereDoc.getEntityMentionById(emId); Pair<Chunk, Float> argumentDistribution = new Pair<Chunk, 
Float>(distribution, 1F); argument.addArgumentConfidencePair(distribution, confidence); relation.addArgument(argument); prevEntityId = entityId; } if (!redundant) relationList.add(relation); } return relationList.isEmpty() ? null : relationList; } /** * Gets the relations. * * @param adeptDoc the adept doc * @param xmlDoc the xml doc * @return the relations */ private List<Relation> getRelations(Document adeptDoc, org.w3c.dom.Document xmlDoc, List<EntityMention> entityMentions) { List<Relation> relationList = new ArrayList<Relation>(); long relationId = 0; NodeList relation_mentions = xmlDoc.getElementsByTagName("relation_mention"); for (int x = 0; x < relation_mentions.getLength(); x++) { String relationType = relation_mentions.item(x).getParentNode().getAttributes().getNamedItem("type") .getNodeValue(); String relationSubtype = relation_mentions.item(x).getParentNode().getAttributes() .getNamedItem("subtype").getNodeValue(); float confidence = 1; Relation relation = new Relation(relationId++, new Type(relationType + ":" + relationSubtype)); relation.setConfidence(confidence); NodeList arguments = ((Element) relation_mentions.item(x)).getElementsByTagName("arg"); boolean redundant = false; //check for relations between mentions of same entity long prevEntityId = -1; for (int y = 0; y < arguments.getLength(); y++) { String argumentType = arguments.item(y).getAttributes().getNamedItem("type").getNodeValue(); int distributionSize = 1; Argument argument = new Argument(new Type(argumentType), distributionSize); String emIdString = arguments.item(y).getAttributes().getNamedItem("entity_mention_id") .getNodeValue(); long emId = Long.parseLong(emIdString.replaceAll("m-", "")); String entityIdString = arguments.item(y).getAttributes().getNamedItem("entity_id").getNodeValue(); long entityId = Long.parseLong(entityIdString.replaceAll("ent-", "")); if (entityId == prevEntityId) { redundant = true; break; } //sets entity mention id of this argument to that of canonical if it 
exists //otherwise, sets canonical mention to this mention //emId = ereDoc.putCanonicalEntityMentionById(entityId,emId).getSequenceId(); Map<Long, EntityMention> mentionsById = new HashMap<Long, EntityMention>(); for (EntityMention mention : entityMentions) { mentionsById.put(mention.getSequenceId(), mention); } Chunk distribution = mentionsById.get(emId); Pair<Chunk, Float> argumentDistribution = new Pair<Chunk, Float>(distribution, 1F); argument.addArgumentConfidencePair(distribution, confidence); relation.addArgument(argument); prevEntityId = entityId; } if (!redundant) relationList.add(relation); } return relationList.isEmpty() ? null : relationList; } /** * Gets the doc id. * * @param xmlDoc the xml doc * @return the doc id */ private String getDocId(org.w3c.dom.Document xmlDoc) { List<EntityMention> entityMentionList = new ArrayList<EntityMention>(); NodeList deft_eres = xmlDoc.getElementsByTagName("deft_ere"); for (int x = 0; x < deft_eres.getLength(); x++) { return deft_eres.item(x).getAttributes().getNamedItem("docid").getNodeValue(); } return "none"; } /** * Gets the entity mentions. 
* * @param ereDoc the ere doc * @param xmlDoc the xml doc * @return the entity mentions */ private List<EntityMention> getEntityMentions(EREDocument ereDoc, org.w3c.dom.Document xmlDoc) { List<EntityMention> entityMentionList = new ArrayList<EntityMention>(); NodeList entity_mentions = xmlDoc.getElementsByTagName("entity_mention"); for (int x = 0; x < entity_mentions.getLength(); x++) { String entityType = entity_mentions.item(x).getParentNode().getAttributes().getNamedItem("type") .getNodeValue(); if (entityType.equals("NA")) entityType = "UNKNOWN"; String entityIdString = entity_mentions.item(x).getParentNode().getAttributes().getNamedItem("id") .getNodeValue(); long entityId = Long.parseLong(entityIdString.replaceAll("ent-", "")); String mentionType = entity_mentions.item(x).getAttributes().getNamedItem("noun_type").getNodeValue(); String idString = entity_mentions.item(x).getAttributes().getNamedItem("id").getNodeValue(); long sequenceId = Long.parseLong(idString.replaceAll("m-", "")); long offset = Long .parseLong(entity_mentions.item(x).getAttributes().getNamedItem("offset").getNodeValue()); long length = Long .parseLong(entity_mentions.item(x).getAttributes().getNamedItem("length").getNodeValue()); long tokenStart = ereDoc.getTokenOffset(ereDoc.getCalculatedOffset(offset)); long tokenEnd = ereDoc.getTokenOffset(ereDoc.getCalculatedOffset(offset + length - 1)); TokenOffset to = new TokenOffset((int) tokenStart, (int) tokenEnd); if (tokenStart == -1 || tokenEnd == -1) { System.out.println((int) offset + " " + (int) (offset + length)); System.out.println(offset + " Entity mention token range " + tokenStart + " to " + tokenEnd + " is out of bounds: " + ereDoc.getFullText().substring((int) offset, (int) (offset + length))); //TODO return null, don't continue continue; // return null; } EntityMention em = new EntityMention(sequenceId, to, ereDoc.getDocument().getTokenStreamList().get(0)); em.setMentionType(OntTypeFactory.getInstance().getType("MENTION_ERE", 
mentionType)); em.setEntityType(new Type(entityType)); HashMap<Long, Float> entityIdDistribution = new HashMap<Long, Float>(); entityIdDistribution.put(entityId, 1F); em.setEntityIdDistribution(entityIdDistribution); ereDoc.getEntityMentionsById().put(sequenceId, em); entityMentionList.add(em); } return entityMentionList.isEmpty() ? null : entityMentionList; } /** * Gets the entity mentions. * * @param adeptDoc the ere doc * @param xmlDoc the xml doc * @return the entity mentions */ private List<EntityMention> getEntityMentions(Document adeptDoc, org.w3c.dom.Document xmlDoc) { List<EntityMention> entityMentionList = new ArrayList<EntityMention>(); NodeList entity_mentions = xmlDoc.getElementsByTagName("entity_mention"); TokenStream tokenStream = adeptDoc.getDefaultTokenStream(); for (int x = 0; x < entity_mentions.getLength(); x++) { String entityType = entity_mentions.item(x).getParentNode().getAttributes().getNamedItem("type") .getNodeValue(); if (entityType.equals("NA")) entityType = "UNKNOWN"; String entityIdString = entity_mentions.item(x).getParentNode().getAttributes().getNamedItem("id") .getNodeValue(); long entityId = Long.parseLong(entityIdString.replaceAll("ent-", "")); String mentionType = entity_mentions.item(x).getAttributes().getNamedItem("noun_type").getNodeValue(); String idString = entity_mentions.item(x).getAttributes().getNamedItem("id").getNodeValue(); long sequenceId = Long.parseLong(idString.replaceAll("m-", "")); long offset = Long .parseLong(entity_mentions.item(x).getAttributes().getNamedItem("offset").getNodeValue()); long length = Long .parseLong(entity_mentions.item(x).getAttributes().getNamedItem("length").getNodeValue()); long tokenStart = -1; long tokenEnd = 10000000; boolean exactStart = false; boolean exactEnd = false; for (Token token : tokenStream) { // System.out.println("Testing token: " + token.getSequenceId() + token.getValue() + " " + token.getCharOffset().getBegin() + " " + token.getCharOffset().getEnd()); if (!exactStart 
&& token.getCharOffset().getBegin() == offset) { tokenStart = token.getSequenceId(); } else if (!exactStart && token.getCharOffset().getBegin() < offset && token.getCharOffset().getBegin() > tokenStart) { tokenStart = token.getSequenceId(); } if (!exactEnd && token.getCharOffset().getEnd() == offset + length) { tokenEnd = token.getSequenceId(); } else if (!exactEnd && token.getCharOffset().getEnd() > offset + length && token.getCharOffset().getEnd() < tokenEnd) { tokenEnd = token.getSequenceId(); } } long endOffset = offset + length - 1; TokenOffset to = new TokenOffset((int) tokenStart, (int) tokenEnd); if (tokenStart == -1 || tokenEnd == 10000000) { System.out.println("Token range " + tokenStart + " to " + tokenEnd + " is out of bounds."); return null; } EntityMention em = new EntityMention(sequenceId, to, tokenStream); em.setMentionType(OntTypeFactory.getInstance().getType("MENTION_ERE", mentionType)); em.setEntityType(new Type(entityType)); HashMap<Long, Float> entityIdDistribution = new HashMap<Long, Float>(); entityIdDistribution.put(entityId, 1F); em.setEntityIdDistribution(entityIdDistribution); entityMentionList.add(em); } return entityMentionList.isEmpty() ? null : entityMentionList; } /** * Gets the named entities. * * @param entityMentions the entity mentions * @return the named entities */ private List<EntityMention> getNamedEntities(List<EntityMention> entityMentions) { List<EntityMention> namedEntities = new ArrayList<EntityMention>(); for (EntityMention em : entityMentions) { String mentionType = em.getMentionType().getType(); String entityType = em.getEntityType().getType(); if (mentionType.equals("NAM") && (entityType.equals("GPE") || entityType.equals("ORG") || entityType.equals("PER"))) namedEntities.add(em); } return namedEntities; } /** * Gets the event relations. 
* * @param ereDoc the ere doc * @return the event relations */ private List<EventRelations> getEventRelations(EREDocument ereDoc) { List<EventRelations> eventRelations = new ArrayList<EventRelations>(); for (long id : ereDoc.getEventsById().keySet()) { EventRelations er = new EventRelations(); er.setCoreferences(ereDoc.getEventById(id)); eventRelations.add(er); } return eventRelations.isEmpty() ? null : eventRelations; } /** * Gets the events. * * @param ereDoc the ere doc * @param xmlDoc the xml doc * @return the events */ private List<Event> getEvents(EREDocument ereDoc, org.w3c.dom.Document xmlDoc) { List<Event> eventList = new ArrayList<Event>(); NodeList evms = xmlDoc.getElementsByTagName("event_mention"); for (int x = 0; x < evms.getLength(); x++) { String eventIdString = evms.item(x).getParentNode().getAttributes().getNamedItem("id").getNodeValue(); String evmIdString = evms.item(x).getAttributes().getNamedItem("id").getNodeValue(); long eventId = Long.parseLong(eventIdString.replaceAll("e-", "")); long evmId = Long.parseLong(evmIdString.replaceAll("evm-", "")); String evmType = evms.item(x).getAttributes().getNamedItem("type").getNodeValue(); String evmSubtype = evms.item(x).getAttributes().getNamedItem("subtype").getNodeValue(); //create new Event from this event_mention Event e = new Event(eventId, OntTypeFactory.getInstance().getType("EVENT_ERE", evmType + "." + evmSubtype)); if (e.getType().contains("OTH")) System.out.println("OTHER:" + evmType + "." 
+ evmSubtype); NodeList triggers = ((Element) evms.item(x)).getElementsByTagName("trigger"); for (int y = 0; y < triggers.getLength(); y++) { String argType = "trigger"; long offset = Long .parseLong(triggers.item(y).getAttributes().getNamedItem("offset").getNodeValue()); long length = Long .parseLong(triggers.item(y).getAttributes().getNamedItem("length").getNodeValue()); TokenOffset to = tokenOffsetFromOffset(offset, length, ereDoc); if (to == null) return null; //add this as an argument, its type to e's attributes list Chunk trigger = new Chunk(to, ereDoc.getDocument().getTokenStreamList().get(0)); Argument argument = new Argument(OntTypeFactory.getInstance().getType("EVENT_ERE", argType), 1); argument.addArgumentConfidencePair(trigger, 1.0F); e.addArgument(argument); } NodeList args = ((Element) evms.item(x)).getElementsByTagName("arg"); for (int y = 0; y < args.getLength(); y++) { String argType = args.item(y).getAttributes().getNamedItem("type").getNodeValue(); String emIdString = args.item(y).getAttributes().getNamedItem("entity_mention_id").getNodeValue(); long emId = Long.parseLong(emIdString.replaceAll("m-", "")); //add this as an argument, its type to e's attributes list EntityMention em = ereDoc.getEntityMentionById(emId); Argument argument = new Argument(new Type(argType), 1); argument.addArgumentConfidencePair(em, 1.0F); e.addArgument(argument); } NodeList places = ((Element) evms.item(x)).getElementsByTagName("place"); for (int y = 0; y < places.getLength(); y++) { String argType = "place"; String emIdString = places.item(y).getAttributes().getNamedItem("entity_mention_id").getNodeValue(); long emId = Long.parseLong(emIdString.replaceAll("m-", "")); //add this as an argument, its type to e's attributes list EntityMention em = ereDoc.getEntityMentionById(emId); Argument argument = new Argument(OntTypeFactory.getInstance().getType("EVENT_ERE", argType), 1); argument.addArgumentConfidencePair(em, 1.0F); e.addArgument(argument); } NodeList dates = 
((Element) evms.item(x)).getElementsByTagName("date"); for (int y = 0; y < dates.getLength(); y++) { String argType = "date"; NodeList date_extents = ((Element) dates.item(y)).getElementsByTagName("date_extent"); for (int z = 0; z < date_extents.getLength(); z++) { long offset = 0; long length = 0; try { offset = Long.parseLong( date_extents.item(z).getAttributes().getNamedItem("offset").getNodeValue()); length = Long.parseLong( date_extents.item(z).getAttributes().getNamedItem("length").getNodeValue()); } catch (NumberFormatException nfe) { continue; } TokenOffset to = tokenOffsetFromOffset(offset, length, ereDoc); if (to == null) return null; //add this as an argument, its type to e's attributes list Chunk date = new Chunk(to, ereDoc.getDocument().getTokenStreamList().get(0)); Argument argument = new Argument(OntTypeFactory.getInstance().getType("EVENT_ERE", argType), 1); argument.addArgumentConfidencePair(date, 1.0F); e.addArgument(argument); } } ereDoc.setEventById(eventId, e); eventList.add(e); } return eventList.isEmpty() ? null : eventList; } /** * Gets the events. 
*
     * <p>Creates one {@link Event} per {@code event_mention} element, using the id of the
     * parent element and the type {@code "<type>.<subtype>"}. Its {@code trigger}, {@code arg},
     * {@code place} and {@code date}/{@code date_extent} children become arguments of the
     * event; mention references are resolved by sequence id against {@code emList}.
     *
     * @param adeptDoc the adept doc whose default token stream backs trigger and date chunks
     * @param xmlDoc   the xml doc
     * @param emList   the entity mentions that event arguments may refer to
     * @return the events; {@code null} when there are none or as soon as a trigger or date
     *         extent cannot be mapped to tokens
     */
    private List<Event> getEvents(Document adeptDoc, org.w3c.dom.Document xmlDoc,
            List<EntityMention> emList) {
        List<Event> events = new ArrayList<Event>();

        Map<Long, EntityMention> mentionIndex = new HashMap<Long, EntityMention>();
        for (EntityMention known : emList) {
            mentionIndex.put(known.getSequenceId(), known);
        }

        NodeList mentionNodes = xmlDoc.getElementsByTagName("event_mention");
        for (int i = 0; i < mentionNodes.getLength(); i++) {
            Element mention = (Element) mentionNodes.item(i);

            String rawEventId = mention.getParentNode().getAttributes().getNamedItem("id").getNodeValue();
            String rawMentionId = mention.getAttributes().getNamedItem("id").getNodeValue();
            long eventId = Long.parseLong(rawEventId.replaceAll("e-", ""));
            // The mention id is parsed but its value is not used afterwards.
            Long.parseLong(rawMentionId.replaceAll("evm-", ""));
            String type = mention.getAttributes().getNamedItem("type").getNodeValue();
            String subtype = mention.getAttributes().getNamedItem("subtype").getNodeValue();

            //create new Event from this event_mention
            Event event = new Event(eventId,
                    OntTypeFactory.getInstance().getType("EVENT_ERE", type + "." + subtype));
            if (event.getType().contains("OTH")) {
                System.out.println("OTHER:" + type + "." + subtype);
            }

            // Triggers: character spans turned into token chunks.
            NodeList triggerNodes = mention.getElementsByTagName("trigger");
            for (int t = 0; t < triggerNodes.getLength(); t++) {
                long offset = Long.parseLong(
                        triggerNodes.item(t).getAttributes().getNamedItem("offset").getNodeValue());
                long length = Long.parseLong(
                        triggerNodes.item(t).getAttributes().getNamedItem("length").getNodeValue());
                TokenOffset span = tokenOffsetFromOffset(offset, length, adeptDoc);
                if (span == null) {
                    return null;
                }
                Chunk trigger = new Chunk(span, adeptDoc.getDefaultTokenStream());
                Argument triggerArgument =
                        new Argument(OntTypeFactory.getInstance().getType("EVENT_ERE", "trigger"), 1);
                triggerArgument.addArgumentConfidencePair(trigger, 1.0F);
                event.addArgument(triggerArgument);
            }

            // Plain arguments: typed by the "type" attribute, filled with the referenced mention.
            NodeList argNodes = mention.getElementsByTagName("arg");
            for (int a = 0; a < argNodes.getLength(); a++) {
                String argType = argNodes.item(a).getAttributes().getNamedItem("type").getNodeValue();
                String rawEmId = argNodes.item(a).getAttributes().getNamedItem("entity_mention_id")
                        .getNodeValue();
                long emId = Long.parseLong(rawEmId.replaceAll("m-", ""));
                EntityMention filler = mentionIndex.get(emId);
                Argument plainArgument = new Argument(new Type(argType), 1);
                plainArgument.addArgumentConfidencePair(filler, 1.0F);
                event.addArgument(plainArgument);
            }

            // Places: always typed "place".
            NodeList placeNodes = mention.getElementsByTagName("place");
            for (int p = 0; p < placeNodes.getLength(); p++) {
                String rawEmId = placeNodes.item(p).getAttributes().getNamedItem("entity_mention_id")
                        .getNodeValue();
                long emId = Long.parseLong(rawEmId.replaceAll("m-", ""));
                EntityMention filler = mentionIndex.get(emId);
                Argument placeArgument =
                        new Argument(OntTypeFactory.getInstance().getType("EVENT_ERE", "place"), 1);
                placeArgument.addArgumentConfidencePair(filler, 1.0F);
                event.addArgument(placeArgument);
            }

            // Dates: one argument per date_extent; extents with non-numeric spans are skipped.
            NodeList dateNodes = mention.getElementsByTagName("date");
            for (int d = 0; d < dateNodes.getLength(); d++) {
                NodeList extentNodes = ((Element) dateNodes.item(d)).getElementsByTagName("date_extent");
                for (int k = 0; k < extentNodes.getLength(); k++) {
                    long offset = 0;
                    long length = 0;
                    try {
                        offset = Long.parseLong(
                                extentNodes.item(k).getAttributes().getNamedItem("offset").getNodeValue());
                        length = Long.parseLong(
                                extentNodes.item(k).getAttributes().getNamedItem("length").getNodeValue());
                    } catch (NumberFormatException nfe) {
                        continue;
                    }
                    TokenOffset span = tokenOffsetFromOffset(offset, length, adeptDoc);
                    if (span == null) {
                        return null;
                    }
                    Chunk date = new Chunk(span, adeptDoc.getDefaultTokenStream());
                    Argument dateArgument =
                            new Argument(OntTypeFactory.getInstance().getType("EVENT_ERE", "date"), 1);
                    dateArgument.addArgumentConfidencePair(date, 1.0F);
                    event.addArgument(dateArgument);
                }
            }

            events.add(event);
        }
        if (events.isEmpty()) {
            return null;
        }
        return events;
    }

    /**
     * Token offset from offset.
     *
     * <p>Maps the first and last character of the span through
     * {@code ereDoc.getCalculatedOffset} and {@code ereDoc.getTokenOffset}. When either lookup
     * yields {@code -1} the covered text and a diagnostic are printed to standard output.
     *
     * @param offset the offset
     * @param length the length
     * @param ereDoc the ere doc
     * @return the token offset, or {@code null} when the span is out of bounds
     */
    private TokenOffset tokenOffsetFromOffset(long offset, long length, EREDocument ereDoc) {
        long firstToken = ereDoc.getTokenOffset(ereDoc.getCalculatedOffset(offset));
        long lastToken = ereDoc.getTokenOffset(ereDoc.getCalculatedOffset(offset + length - 1));
        TokenOffset span = new TokenOffset((int) firstToken, (int) lastToken);
        boolean inBounds = firstToken != -1 && lastToken != -1;
        if (inBounds) {
            return span;
        }
        System.out.println(ereDoc.getFullText().substring((int) offset, (int) (offset + length)));
        System.out.println("Token range " + firstToken + " to " + lastToken + " is out of bounds.");
        return null;
    }

    /**
     * Token offset from offset.
* * @param offset the offset * @param length the length * @param adeptDoc the adept doc * @return the token offset */ private TokenOffset tokenOffsetFromOffset(long offset, long length, Document adeptDoc) { TokenStream tokenStream = adeptDoc.getDefaultTokenStream(); long tokenStart = -1; long tokenEnd = 10000000; boolean exactStart = false; boolean exactEnd = false; for (Token token : tokenStream) { if (!exactStart && token.getCharOffset().getBegin() == offset) { tokenStart = token.getSequenceId(); } else if (!exactStart && token.getCharOffset().getBegin() < offset && token.getCharOffset().getBegin() > tokenStart) { tokenStart = token.getSequenceId(); } if (!exactEnd && token.getCharOffset().getEnd() == offset + length - 1) { tokenEnd = token.getSequenceId(); } else if (!exactEnd && token.getCharOffset().getEnd() > offset + length - 1 && token.getCharOffset().getEnd() < tokenEnd) { tokenEnd = token.getSequenceId(); } } TokenOffset to = new TokenOffset((int) tokenStart, (int) tokenEnd); if (tokenStart == -1 || tokenEnd == -1) { System.out.println("Token range " + tokenStart + " to " + tokenEnd + " is out of bounds."); return null; } return to; } /** * Creates HltContentContainer corresponding to ERE transcription and annotation file. 
* * @param EREPath path to ERE transcription file * @param XMLPath path to ERE annotation file or null * @return an HLTContentContainer */ public HltContentContainer EREtoHltContentContainer(String EREPath, String XMLPath, String language) { org.w3c.dom.Document xmlDoc = readXML(XMLPath); String docId = getDocId(xmlDoc); EREDocument ereDoc = readEREFile(EREPath, docId, language); HltContentContainer hltcc = ereDoc.getHltContentContainer(); if (xmlDoc != null) { List<EntityMention> entityMentions = getEntityMentions(ereDoc, xmlDoc); if (entityMentions != null) { hltcc.setEntityMentions(entityMentions); hltcc.setNamedEntities(getNamedEntities(entityMentions)); hltcc.setRelations(getRelations(ereDoc, xmlDoc)); hltcc.setCoreferences(getCoreferences(ereDoc, xmlDoc, entityMentions)); List<Event> eventList = getEvents(ereDoc, xmlDoc); hltcc.setEvents(eventList); hltcc.setEventRelations(getEventRelations(ereDoc)); } } else System.out.println("XMLDoc is NULL"); return hltcc; } /** * Gets the pO ss. * * @param conllDoc the conll doc * @return the pO ss */ private List<PartOfSpeech> getPOSs(CoNLLDocument conllDoc) { List<PartOfSpeech> POSs = new ArrayList<PartOfSpeech>(); TokenStream ts = conllDoc.getDocument().getTokenStreamList().get(0); for (int x = 0; x < ts.size(); x++) { PartOfSpeech POS = new PartOfSpeech((long) x, new TokenOffset(x, x), ts); POS.setPosTag(new Type(conllDoc.getPOSByToken(ts.get(x)))); POSs.add(POS); } return POSs; } /** * Gets the entities. 
*
     * <p>Walks the per-sentence, per-token (label, id) pairs returned by
     * {@code conllDoc.getNamedEntities()}. Every maximal run of consecutive tokens in a
     * sentence that share the same non-negative id yields one {@link EntityMention} spanning
     * the run and one {@link Entity} with that mention as canonical mention. Each token
     * position of the run, counted across the whole document, is mapped to that entity.
     *
     * <p>NOTE(review): a sentence with no tokens would fail on the initial
     * {@code get(entityEnd)} lookup - confirm that empty sentences cannot occur.
     *
     * @param conllDoc the conll doc
     * @return map from document-wide token index to the entity covering that token
     */
    private HashMap<Integer, Entity> getEntities(CoNLLDocument conllDoc) {
        List<List<Pair<String, Long>>> entList = conllDoc.getNamedEntities();
        TokenStream ts = conllDoc.getDocument().getTokenStreamList().get(0);
        HashMap<Integer, Entity> entityLocations = new HashMap<Integer, Entity>();
        // Number of tokens in the sentences already processed; turns a sentence-local index
        // into a document-wide one.
        int runningSum = 0;
        for (int x = 0; x < entList.size(); x++) {
            // [entityStart, entityEnd) is the run of tokens currently being examined.
            int entityStart = 0;
            int entityEnd = 0;
            long entityId = (long) entList.get(x).get(entityEnd).getR();
            ;
            // Runs one step past the last token so that a run ending the sentence is flushed.
            while (entityEnd <= entList.get(x).size()) {
                if (entityId >= 0) {
                    // The run ends at the sentence end or where the id changes.
                    if (entityEnd == entList.get(x).size()
                            || (long) entList.get(x).get(entityEnd).getR() != entityId) {
                        EntityMention em = new EntityMention(entityId,
                                new TokenOffset(runningSum + entityStart, runningSum + entityEnd - 1), ts);
                        // NOTE(review): this distribution is filled but never attached to the
                        // mention - possibly a missing setEntityIdDistribution call; confirm.
                        Map<Long, Float> distribution = new HashMap<Long, Float>();
                        distribution.put(entityId, 1F);
                        // Mention type is the first three characters of the run's first label;
                        // entity type is the constant "NAM".
                        // NOTE(review): getNamedEntities in this file tests mention type "NAM"
                        // and entity type GPE/ORG/PER - these two look swapped here; confirm.
                        em.setMentionType(new Type(entList.get(x).get(entityStart).getL().substring(0, 3)));
                        em.setEntityType(new Type("NAM"));
                        Entity e = new Entity(entityId, new Type("NAM"));
                        e.setCanonicalMentions(em);
                        for (int y = entityStart; y < entityEnd; y++)
                            entityLocations.put(runningSum + y, e);
                    }
                }
                if (entityEnd == entList.get(x).size())
                    break;
                // Advance: pick up the id of the token at entityEnd and start a new run there
                // if it differs from the id seen so far.
                long prevId = entityId;
                entityId = (long) entList.get(x).get(entityEnd).getR();
                if (entityId != prevId)
                    entityStart = entityEnd;
                entityEnd++;
            }
            runningSum += entList.get(x).size();
        }
        return entityLocations;
    }

    /**
     * Gets the entity mentions.
     *
     * <p>Collects the canonical mention of every entity, in the order of {@code entities}.
     *
     * @param conllDocument the conll document (not used by this method)
     * @param entities      the entities
     * @return the entity mentions
     */
    private List<EntityMention> getEntityMentions(CoNLLDocument conllDocument, List<Entity> entities) {
        List<EntityMention> entityMentions = new ArrayList<EntityMention>();
        for (Entity e : entities)
            entityMentions.add(e.getCanonicalMention());
        return entityMentions;
    }

    /**
     * Gets the coreferences.
* * @param conllDocument the conll document * @param entityMap the entity map * @param entityMentions the entity mentions * @return the coreferences */ private List<Coreference> getCoreferences(CoNLLDocument conllDocument, Map<Integer, Entity> entityMap, List<EntityMention> entityMentions) { Map<Long, Set<Entity>> corefEntityMap = new HashMap<Long, Set<Entity>>(); List<List<Set<Long>>> corefIds = conllDocument.getCorefs(); List<Coreference> corefs = new ArrayList<Coreference>(); for (int x = 0; x < corefIds.size(); x++) { for (int y = 0; y < corefIds.get(x).size(); y++) { for (long corefId : corefIds.get(x).get(y)) { if (corefEntityMap.get(corefId) == null) { Set<Entity> entities = new HashSet<Entity>(); corefEntityMap.put(corefId, entities); } if (entityMap.get(x + y) != null) corefEntityMap.get(corefId).add(entityMap.get(x + y)); } } } for (long corefId : corefEntityMap.keySet()) { Coreference c = new Coreference(corefId); c.setResolvedMentions(entityMentions); c.setEntities(new ArrayList<Entity>(corefEntityMap.get(corefId))); corefs.add(c); } return corefs; } /** * Gets the sentences. * * @param conllDoc the conll doc * @return the sentences */ public List<Sentence> getSentences(CoNLLDocument conllDoc) { List<List<String>> tokens = conllDoc.getTokens(); List<Sentence> sentences = new ArrayList<Sentence>(); int runningOffset = 0; for (int x = 0; x < tokens.size(); x++) { sentences.add( new Sentence((long) x, new TokenOffset(runningOffset, runningOffset + tokens.get(x).size() - 1), conllDoc.getDocument().getTokenStreamList().get(0))); runningOffset += tokens.get(x).size(); } return sentences; } /** * Reads specified CONLL file into an CONLLDocument (Document object wrapper). * * @param path Path to conll file to read. * @return an CONLLDocument with fields derived from file at path. 
*/ public CoNLLDocument readCoNLLFile(String path) { String filestring = readFileIntoString(path); CoNLLDocument conllDoc = new CoNLLDocument(filestring); conllDoc.createDocument(); return conllDoc; } /** * Creates HltContentContainer corresponding to CoNLL 2011 file. * * @param filepath the filepath * @return an HLTContentContainer */ public HltContentContainer CoNLLtoHltContentContainer(String filepath) { CoNLLDocument conllDoc = readCoNLLFile(filepath); HltContentContainer hltcc = new HltContentContainer(); HashMap<Integer, Entity> entityMap = getEntities(conllDoc); HashSet<Entity> entitySet = new HashSet<Entity>(); for (Integer i : entityMap.keySet()) entitySet.add(entityMap.get(i)); List<Entity> entities = new ArrayList<Entity>(entitySet); List<EntityMention> entityMentions = getEntityMentions(conllDoc, entities); hltcc.setSentences(getSentences(conllDoc)); hltcc.setPartOfSpeechs(getPOSs(conllDoc)); hltcc.setEntityMentions(entityMentions); hltcc.setNamedEntities(entityMentions); hltcc.setCoreferences(getCoreferences(conllDoc, entityMap, entityMentions)); return hltcc; } /** * Creates HltContentContainer corresponding to CoNLL 2011 file. * * @param filepath the filepath * @return an HLTContentContainer */ public HltContentContainer LDCForumtoHltContentContainer(String filepath, String XMLPath, String language) { try { org.w3c.dom.Document xmlDoc = readXML(XMLPath); // This HLTCC is discarded. 
HltContentContainer hltContentContainer = new HltContentContainer(); Document document = DocumentMaker.getInstance().createDefaultDocument(filepath, hltContentContainer); String text = null; text = fileToString(filepath); document.setValue(text); List<PassageAttributes> passageAttributesList = new ArrayList<PassageAttributes>(); // Pair<List<Tag>,adept.common.Document> documentPairs = LDCCorpusReader.getInstance() // .readCorpus(text, passageAttributesList, null, // null, // null); HltContentContainer hltcc = LDCCorpusReader.getInstance().readPosts(text, passageAttributesList, null, null, language); if (hltcc.getPassages() == null) { hltcc.setPassages(new ArrayList<Passage>()); } if (hltcc.getSentences() == null) { hltcc.setSentences(new ArrayList<Sentence>()); } if (xmlDoc != null) { List<EntityMention> entityMentions = getEntityMentions(document, xmlDoc); if (entityMentions != null) { hltcc.setEntityMentions(entityMentions); hltcc.setNamedEntities(getNamedEntities(entityMentions)); hltcc.setRelations(getRelations(document, xmlDoc, entityMentions)); hltcc.setCoreferences(getCoreferences(document, xmlDoc, entityMentions)); List<Event> eventList = getEvents(document, xmlDoc, entityMentions); hltcc.setEvents(eventList); // hltcc.setEventRelations(getEventRelations(document)); } } return hltcc; } catch (Exception e) { e.printStackTrace(); return null; } } /** * Find stream in classpath or file system. 
* * @param name the name * @return the input stream * @throws FileNotFoundException the file not found exception */ public static BOMInputStream findStreamInClasspathOrFileSystem(String name) throws FileNotFoundException { InputStream is = Reader.class.getClassLoader().getResourceAsStream(name); BOMInputStream bis = new BOMInputStream(is, false); if (is == null) { is = Reader.class.getClassLoader().getResourceAsStream(name.replaceAll("\\\\", "/")); bis = new BOMInputStream(is, false); if (is == null) { File f = new File(name); if (!f.exists()) { System.out.println("Warning - creating FileInputStream: " + f.getAbsolutePath()); } is = new FileInputStream(name); bis = new BOMInputStream(is, false); //System.out.println("Reading InputStream: " + f.getAbsolutePath()); } else { System.out.println("InputStream found in Class Loader after adjusting separators."); } } else { //System.out.println("InputStream found in Class Loader."); } return bis; } /** * Gets the absolute path from classpath or file system. * * @param name the name * @return the absolute path from classpath or file system * @throws FileNotFoundException the file not found exception */ public static String getAbsolutePathFromClasspathOrFileSystem(String name) throws FileNotFoundException { if (name.startsWith("/")) return name; // // System.out.println("To find the absolute path of: " + name); URL url = null; try { url = Reader.class.getClassLoader().getResource(name); if (url != null) return url.toURI().getPath(); else return null; } catch (Exception e) { e.printStackTrace(); return null; } } /** * List directory contents for a resource folder. Not recursive. * This is basically a brute-force implementation. * Works for regular files and also JARs. * * @param clazz Any java class that lives in the same place as the resources you want. * @param path Should end with "/", but not start with one. * @return Just the name of each member item, not the full paths. 
* @throws URISyntaxException the uRI syntax exception * @throws IOException Signals that an I/O exception has occurred. * @author Greg Briggs */ public static String[] getResourceListing(Class clazz, String path) throws URISyntaxException, IOException { URL dirURL = clazz.getClassLoader().getResource(path); if (dirURL != null && dirURL.getProtocol().equals("file")) { /* A file path: easy enough */ return new File(dirURL.toURI()).list(); } if (dirURL == null) { /* * In case of a jar file, we can't actually find a directory. * Have to assume the same jar as clazz. */ String me = clazz.getName().replace(".", "/") + ".class"; dirURL = clazz.getClassLoader().getResource(me); } if (dirURL.getProtocol().equals("jar")) { /* A JAR path */ String jarPath = dirURL.getPath().substring(5, dirURL.getPath().indexOf("!")); //strip out only the JAR file JarFile jar = new JarFile(URLDecoder.decode(jarPath, "UTF-8")); Enumeration<JarEntry> entries = jar.entries(); //gives ALL entries in jar Set<String> result = new HashSet<String>(); //avoid duplicates in case it is a subdirectory while (entries.hasMoreElements()) { String name = entries.nextElement().getName(); if (name.startsWith(path)) { //filter according to the path String entry = name.substring(path.length()); int checkSubdir = entry.indexOf("/"); if (checkSubdir >= 0) { // if it is a subdirectory, we just return the directory name entry = entry.substring(0, checkSubdir); } result.add(entry); } } return result.toArray(new String[result.size()]); } throw new UnsupportedOperationException("Cannot list files for URL " + dirURL); } /** * Reads specified file into a string. 
* * @param path the path * @return the string */ public String readFileIntoString(String path) { try { String absolutePath = path; // String absolutePath = getAbsolutePathFromClasspathOrFileSystem(path); FileInputStream stream = new FileInputStream(new File(absolutePath)); try { FileChannel fc = stream.getChannel(); MappedByteBuffer bb = fc.map(FileChannel.MapMode.READ_ONLY, 0, fc.size()); /* Instead of using default, pass in a decoder. */ return Charset.forName("UTF-8").decode(bb).toString(); } finally { stream.close(); } } catch (IOException e) { e.printStackTrace(); } return null; } /** * Reads specified file into a byte array. * * @param path the path * @return the byte[] */ public byte[] readFileIntoByteArray(String path) { try { RandomAccessFile f = new RandomAccessFile(path, "r"); byte[] b = new byte[(int) f.length()]; f.read(b); f.close(); return b; } catch (IOException e) { e.printStackTrace(); } return null; } /** * Read file into lines. * * @param filename the filename * @param lines the lines * @return the string */ public String readFileIntoLines(String filename, List<String> lines) { if (lines == null) lines = new ArrayList<String>(); String line = ""; StringBuffer sb = new StringBuffer(); try { BufferedReader in = new BufferedReader(new InputStreamReader( new BOMInputStream(new FileInputStream(new File(filename)), false), "UTF-8")); while ((line = in.readLine()) != null) { if (!line.isEmpty()) { String surrogatesRemoved = checkSurrogates(line); lines.add(surrogatesRemoved); sb.append(surrogatesRemoved); sb.append("\n"); } } in.close(); } catch (IOException ex) { throw new RuntimeException(ex); } return sb.toString(); } /** * File to lines. 
*
 * <p>Reads the file as UTF-8 with a leading byte order mark excluded and passes every line,
 * including empty ones, through {@code checkSurrogates}.
 *
 * @param filename the filename
 * @return the list of processed lines
 * @throws RuntimeException wrapping any {@link IOException} raised while reading
 */
public List<String> fileToLines(String filename) {
    List<String> lines = new LinkedList<String>();
    try {
        BufferedReader in = new BufferedReader(new InputStreamReader(
                new BOMInputStream(new FileInputStream(new File(filename)), false), "UTF-8"));
        try {
            String line;
            while ((line = in.readLine()) != null) {
                lines.add(checkSurrogates(line));
            }
        } finally {
            // Close the reader even when reading fails part-way through.
            in.close();
        }
    } catch (IOException ex) {
        throw new RuntimeException(ex);
    }
    return lines;
}

/**
 * Convert stream to string.
 *
 * <p>Lines are concatenated without their line terminators. The stream is decoded with the
 * platform default charset and is closed before returning.
 * NOTE(review): the file readers in this class decode as UTF-8; confirm whether the
 * default charset is intended here.
 *
 * @param is the stream to read
 * @return the concatenated lines
 * @throws IOException Signals that an I/O exception has occurred.
 */
public String convertStreamToString(java.io.InputStream is) throws IOException {
    BufferedReader reader = new BufferedReader(new InputStreamReader(is));
    try {
        StringBuilder out = new StringBuilder();
        String line;
        while ((line = reader.readLine()) != null) {
            out.append(line);
        }
        return out.toString();
    } finally {
        reader.close();
    }
}

/**
 * Removes every surrogate code unit (high or low, paired or not) from the text and prints a
 * warning for each one removed.
 *
 * @param text the text to filter
 * @return the text without surrogate code units
 */
public static String checkSurrogates(String text) {
    StringBuilder buffer = new StringBuilder(text.length());
    for (int i = 0; i < text.length(); i++) {
        char c = text.charAt(i);
        if (Character.isHighSurrogate(c) || Character.isLowSurrogate(c)) {
            System.out.println("WARNING -- invalid xml character " + c + " removed");
        } else {
            buffer.append(c);
        }
    }
    return buffer.toString();
}

/**
 * File to string.
 *
 * <p>Reads the file as UTF-8 with a leading byte order mark excluded and terminates every
 * line with "\n".
 *
 * @param filename the filename
 * @return the file content
 * @throws RuntimeException wrapping any {@link IOException} raised while reading
 */
private String fileToString(String filename) {
    // A builder avoids re-copying the accumulated text for every line.
    StringBuilder content = new StringBuilder();
    try {
        BufferedReader in = new BufferedReader(new InputStreamReader(
                new BOMInputStream(new FileInputStream(new File(filename)), false), "UTF-8"));
        try {
            String line;
            while ((line = in.readLine()) != null) {
                content.append(line).append("\n");
            }
        } finally {
            // Close the reader even when reading fails part-way through.
            in.close();
        }
    } catch (IOException ex) {
        throw new RuntimeException(ex);
    }
    return content.toString();
}
}