List of usage examples for opennlp.tools.cmdline CmdLineUtil openInFile
public static FileInputStream openInFile(File file)
From source file: eus.ixa.ixa.pipe.convert.AbsaSemEval.java
private static void absa2015TargetNullToNAFNER(KAFDocument kaf, String fileName, String language, String nullDict) {//from w w w. jav a 2 s. c o m //reading the ABSA xml file SAXBuilder sax = new SAXBuilder(); XPathFactory xFactory = XPathFactory.instance(); try { Document doc = sax.build(fileName); XPathExpression<Element> expr = xFactory.compile("//sentence", Filters.element()); List<Element> sentences = expr.evaluate(doc); //REMOVE UNUSUAL CLASSES List<String> removeClass = new ArrayList<>(); /*removeClass.add("DRINKS#PRICES"); removeClass.add("LOCATION#GENERAL"); removeClass.add("DRINKS#STYLE_OPTIONS"); removeClass.add("DRINKS#QUALITY"); removeClass.add("RESTAURANT#PRICES"); removeClass.add("FOOD#PRICES"); removeClass.add("RESTAURANT#MISCELLANEOUS"); */ //Used opinion target tokens List<String> usedTargets = new ArrayList<>(); for (Element sent : sentences) { Element opinionsElement = sent.getChild("Opinions"); if (opinionsElement != null) { List<Element> opinionList = opinionsElement.getChildren(); for (Element opinion : opinionList) { String targetString = opinion.getAttributeValue("target"); String[] targetStringSplitted = targetString.split("\\s+"); for (String str : targetStringSplitted) { if (!usedTargets.contains(str.toLowerCase())) { usedTargets.add(str.toLowerCase()); } } } } } //naf sentence counter int counter = 1; for (Element sent : sentences) { List<Integer> wfFromOffsets = new ArrayList<>(); List<Integer> wfToOffsets = new ArrayList<>(); List<WF> sentWFs = new ArrayList<>(); List<Term> sentTerms = new ArrayList<>(); //sentence id and original text String sentId = sent.getAttributeValue("id"); String sentString = sent.getChildText("text"); //the list contains just one list of tokens List<List<Token>> segmentedSentence = tokenizeSentence(sentString, language); for (List<Token> sentence : segmentedSentence) { for (Token token : sentence) { WF wf = kaf.newWF(token.startOffset(), token.getTokenValue(), counter); wf.setXpath(sentId); final List<WF> 
wfTarget = new ArrayList<WF>(); wfTarget.add(wf); wfFromOffsets.add(wf.getOffset()); wfToOffsets.add(wf.getOffset() + wf.getLength()); sentWFs.add(wf); Term term = kaf.newTerm(KAFDocument.newWFSpan(wfTarget)); term.setPos("O"); term.setLemma(token.getTokenValue()); sentTerms.add(term); } } counter++; String[] tokenIds = new String[sentWFs.size()]; for (int i = 0; i < sentWFs.size(); i++) { tokenIds[i] = sentWFs.get(i).getId(); } //going through every opinion element for each sentence //each opinion element can contain one or more opinions Element opinionsElement = sent.getChild("Opinions"); if (opinionsElement != null) { //iterating over every opinion in the opinions element List<Element> opinionList = opinionsElement.getChildren(); for (Element opinion : opinionList) { String category = opinion.getAttributeValue("category"); String targetString = opinion.getAttributeValue("target"); System.err.println("-> " + sentId + " | " + category + ", " + targetString); //adding OTE if (!targetString.equalsIgnoreCase("NULL")) { if (!removeClass.contains(category)) { int fromOffset = Integer.parseInt(opinion.getAttributeValue("from")); int toOffset = Integer.parseInt(opinion.getAttributeValue("to")); int startIndex = -1; int endIndex = -1; for (int i = 0; i < wfFromOffsets.size(); i++) { if (wfFromOffsets.get(i) == fromOffset) { startIndex = i; } } for (int i = 0; i < wfToOffsets.size(); i++) { if (wfToOffsets.get(i) == toOffset) { //span is +1 with respect to the last token of the span endIndex = i + 1; } } List<String> wfIds = Arrays .asList(Arrays.copyOfRange(tokenIds, startIndex, endIndex)); List<String> wfTermIds = getWFIdsFromTerms(sentTerms); if (checkTermsRefsIntegrity(wfIds, wfTermIds)) { List<Term> nameTerms = kaf.getTermsFromWFs(wfIds); ixa.kaflib.Span<Term> neSpan = KAFDocument.newTermSpan(nameTerms); List<ixa.kaflib.Span<Term>> references = new ArrayList<ixa.kaflib.Span<Term>>(); references.add(neSpan); Entity neEntity = kaf.newEntity(references); 
neEntity.setType(category); } } } else { Path path = Paths.get(nullDict); InputStream in = CmdLineUtil.openInFile(path.toFile()); Dictionary dictionary = new Dictionary(in); int startIndex = -1; List<Integer> startIndexs = new ArrayList<>(); for (WF wf : sentWFs) { startIndex += 1; String word = wf.getForm(); if (dictionary.lookup(word.toLowerCase()) != null) { if (dictionary.lookup(word.toLowerCase()).equalsIgnoreCase(category) && !usedTargets.contains(word.toLowerCase()) && !removeClass.contains(category)) { startIndexs.add(startIndex); break; } } } for (Integer indexes : startIndexs) { List<String> wfIds = Arrays .asList(Arrays.copyOfRange(tokenIds, indexes, indexes + 1)); List<String> wfTermIds = getWFIdsFromTerms(sentTerms); if (checkTermsRefsIntegrity(wfIds, wfTermIds)) { List<Term> nameTerms = kaf.getTermsFromWFs(wfIds); ixa.kaflib.Span<Term> neSpan = KAFDocument.newTermSpan(nameTerms); List<ixa.kaflib.Span<Term>> references = new ArrayList<ixa.kaflib.Span<Term>>(); references.add(neSpan); Entity neEntity = kaf.newEntity(references); neEntity.setType(category); } } } } } } //end of sentence } catch (JDOMException | IOException e) { e.printStackTrace(); } }
From source file: eus.ixa.ixa.pipe.convert.AbsaSemEval.java
private static void absa2015TargetNullToNAFNER_TARGET(KAFDocument kaf, String fileName, String language, String nullDict) {//w ww. j a v a2 s . co m //reading the ABSA xml file SAXBuilder sax = new SAXBuilder(); XPathFactory xFactory = XPathFactory.instance(); try { Document doc = sax.build(fileName); XPathExpression<Element> expr = xFactory.compile("//sentence", Filters.element()); List<Element> sentences = expr.evaluate(doc); //Used opinion target tokens List<String> usedTargets = new ArrayList<>(); for (Element sent : sentences) { Element opinionsElement = sent.getChild("Opinions"); if (opinionsElement != null) { List<Element> opinionList = opinionsElement.getChildren(); for (Element opinion : opinionList) { String targetString = opinion.getAttributeValue("target"); String[] targetStringSplitted = targetString.split("\\s+"); for (String str : targetStringSplitted) { if (!usedTargets.contains(str.toLowerCase())) { usedTargets.add(str.toLowerCase()); } } } } } //naf sentence counter int counter = 1; for (Element sent : sentences) { List<Integer> wfFromOffsets = new ArrayList<>(); List<Integer> wfToOffsets = new ArrayList<>(); List<WF> sentWFs = new ArrayList<>(); List<Term> sentTerms = new ArrayList<>(); //sentence id and original text String sentId = sent.getAttributeValue("id"); String sentString = sent.getChildText("text"); //the list contains just one list of tokens List<List<Token>> segmentedSentence = tokenizeSentence(sentString, language); for (List<Token> sentence : segmentedSentence) { for (Token token : sentence) { WF wf = kaf.newWF(token.startOffset(), token.getTokenValue(), counter); wf.setXpath(sentId); final List<WF> wfTarget = new ArrayList<WF>(); wfTarget.add(wf); wfFromOffsets.add(wf.getOffset()); wfToOffsets.add(wf.getOffset() + wf.getLength()); sentWFs.add(wf); Term term = kaf.newTerm(KAFDocument.newWFSpan(wfTarget)); term.setPos("O"); term.setLemma(token.getTokenValue()); sentTerms.add(term); } } counter++; String[] tokenIds = new 
String[sentWFs.size()]; for (int i = 0; i < sentWFs.size(); i++) { tokenIds[i] = sentWFs.get(i).getId(); } //going through every opinion element for each sentence //each opinion element can contain one or more opinions Element opinionsElement = sent.getChild("Opinions"); if (opinionsElement != null) { //iterating over every opinion in the opinions element List<Element> opinionList = opinionsElement.getChildren(); for (Element opinion : opinionList) { String category = opinion.getAttributeValue("category"); String targetString = opinion.getAttributeValue("target"); System.err.println("-> " + category + ", " + targetString); //adding OTE if (!targetString.equalsIgnoreCase("NULL")) { int fromOffset = Integer.parseInt(opinion.getAttributeValue("from")); int toOffset = Integer.parseInt(opinion.getAttributeValue("to")); int startIndex = -1; int endIndex = -1; for (int i = 0; i < wfFromOffsets.size(); i++) { if (wfFromOffsets.get(i) == fromOffset) { startIndex = i; } } for (int i = 0; i < wfToOffsets.size(); i++) { if (wfToOffsets.get(i) == toOffset) { //span is +1 with respect to the last token of the span endIndex = i + 1; } } List<String> wfIds = Arrays.asList(Arrays.copyOfRange(tokenIds, startIndex, endIndex)); List<String> wfTermIds = getWFIdsFromTerms(sentTerms); if (checkTermsRefsIntegrity(wfIds, wfTermIds)) { List<Term> nameTerms = kaf.getTermsFromWFs(wfIds); ixa.kaflib.Span<Term> neSpan = KAFDocument.newTermSpan(nameTerms); List<ixa.kaflib.Span<Term>> references = new ArrayList<ixa.kaflib.Span<Term>>(); references.add(neSpan); Entity neEntity = kaf.newEntity(references); neEntity.setType("TARGET"); } } else { Path path = Paths.get(nullDict); InputStream in = CmdLineUtil.openInFile(path.toFile()); Dictionary dictionary = new Dictionary(in); int startIndex = -1; List<Integer> startIndexs = new ArrayList<>(); for (WF wf : sentWFs) { startIndex += 1; String word = wf.getForm(); if (dictionary.lookup(word.toLowerCase()) != null) { if 
(dictionary.lookup(word.toLowerCase()).equalsIgnoreCase(category) && !usedTargets.contains(word.toLowerCase())) { startIndexs.add(startIndex); break; } } } for (Integer indexes : startIndexs) { List<String> wfIds = Arrays .asList(Arrays.copyOfRange(tokenIds, indexes, indexes + 1)); List<String> wfTermIds = getWFIdsFromTerms(sentTerms); if (checkTermsRefsIntegrity(wfIds, wfTermIds)) { List<Term> nameTerms = kaf.getTermsFromWFs(wfIds); ixa.kaflib.Span<Term> neSpan = KAFDocument.newTermSpan(nameTerms); List<ixa.kaflib.Span<Term>> references = new ArrayList<ixa.kaflib.Span<Term>>(); references.add(neSpan); Entity neEntity = kaf.newEntity(references); neEntity.setType("TARGET"); } } } } } } //end of sentence } catch (JDOMException | IOException e) { e.printStackTrace(); } }
From source file: es.ehu.si.ixa.pipe.convert.Convert.java
/**
 * Filters a file line by line, keeping only lines whose non-whitespace
 * characters are at least 90% lowercase.
 *
 * @param inFile the input file, read as UTF-8
 * @return the kept lines concatenated, each followed by a newline
 * @throws IOException if the file cannot be opened or read
 */
private String brownCleanUpperCase(File inFile) throws IOException {
  StringBuilder precleantext = new StringBuilder();
  // FIX: try-with-resources — the original leaked the stream and reader
  // whenever readLine() threw before breader.close() was reached.
  try (BufferedReader breader = new BufferedReader(
      new InputStreamReader(CmdLineUtil.openInFile(inFile), Charset.forName("UTF-8")))) {
    String line;
    while ((line = breader.readLine()) != null) {
      // concatenate the line's tokens, dropping the spaces between them
      // (FIX: removed the dead always-true `lineArray.length > 0` check)
      StringBuilder sb = new StringBuilder();
      for (String word : line.split(" ")) {
        sb.append(word);
      }
      char[] lineCharArray = sb.toString().toCharArray();
      double lowercaseCounter = 0;
      for (char lineChar : lineCharArray) {
        if (Character.isLowerCase(lineChar)) {
          lowercaseCounter++;
        }
      }
      // NOTE: an empty line yields 0/0 = NaN, and NaN >= 0.90 is false,
      // so empty lines are dropped — same behavior as the original.
      double percent = lowercaseCounter / (double) lineCharArray.length;
      if (percent >= 0.90) {
        precleantext.append(line).append("\n");
      }
    }
  }
  return precleantext.toString();
}