Example usage for opennlp.tools.cmdline CmdLineUtil openInFile

List of usage examples for opennlp.tools.cmdline CmdLineUtil openInFile

Introduction

On this page you can find example usage for opennlp.tools.cmdline CmdLineUtil openInFile.

Prototype

public static FileInputStream openInFile(File file) 

Source Link

Usage

From source file:eus.ixa.ixa.pipe.convert.AbsaSemEval.java

/**
 * Converts ABSA 2015 opinion annotations from an XML file into NAF NER entities.
 * Opinions with an explicit target are aligned to the tokenized text through their
 * character offsets; opinions with a NULL target fall back to a dictionary lookup
 * over the sentence tokens, skipping tokens already used as explicit targets.
 *
 * @param kaf the NAF document to which word forms, terms and entities are added
 * @param fileName the path of the ABSA XML file to read
 * @param language the language passed to the tokenizer
 * @param nullDict the path of the word-to-category dictionary used for NULL targets
 */
private static void absa2015TargetNullToNAFNER(KAFDocument kaf, String fileName, String language,
        String nullDict) {
    //reading the ABSA xml file
    SAXBuilder sax = new SAXBuilder();
    XPathFactory xFactory = XPathFactory.instance();
    try {
        Document doc = sax.build(fileName);
        XPathExpression<Element> expr = xFactory.compile("//sentence", Filters.element());
        List<Element> sentences = expr.evaluate(doc);

        //categories excluded from entity creation; currently empty, add
        //category names (e.g. "DRINKS#PRICES") here to filter them out
        List<String> removeClass = new ArrayList<>();

        //collect every token (lowercased) appearing in some explicit opinion target,
        //so the NULL-target fallback never re-uses them
        List<String> usedTargets = new ArrayList<>();
        for (Element sent : sentences) {
            Element opinionsElement = sent.getChild("Opinions");
            if (opinionsElement != null) {
                for (Element opinion : opinionsElement.getChildren()) {
                    String targetString = opinion.getAttributeValue("target");
                    for (String str : targetString.split("\\s+")) {
                        if (!usedTargets.contains(str.toLowerCase())) {
                            usedTargets.add(str.toLowerCase());
                        }
                    }
                }
            }
        }

        //naf sentence counter
        int counter = 1;
        for (Element sent : sentences) {
            List<Integer> wfFromOffsets = new ArrayList<>();
            List<Integer> wfToOffsets = new ArrayList<>();
            List<WF> sentWFs = new ArrayList<>();
            List<Term> sentTerms = new ArrayList<>();
            //sentence id and original text
            String sentId = sent.getAttributeValue("id");
            String sentString = sent.getChildText("text");
            //the list contains just one list of tokens
            List<List<Token>> segmentedSentence = tokenizeSentence(sentString, language);
            for (List<Token> sentence : segmentedSentence) {
                for (Token token : sentence) {
                    WF wf = kaf.newWF(token.startOffset(), token.getTokenValue(), counter);
                    wf.setXpath(sentId);
                    final List<WF> wfTarget = new ArrayList<WF>();
                    wfTarget.add(wf);
                    wfFromOffsets.add(wf.getOffset());
                    wfToOffsets.add(wf.getOffset() + wf.getLength());
                    sentWFs.add(wf);
                    Term term = kaf.newTerm(KAFDocument.newWFSpan(wfTarget));
                    term.setPos("O");
                    term.setLemma(token.getTokenValue());
                    sentTerms.add(term);
                }
            }
            counter++;
            String[] tokenIds = new String[sentWFs.size()];
            for (int i = 0; i < sentWFs.size(); i++) {
                tokenIds[i] = sentWFs.get(i).getId();
            }
            //going through every opinion element for each sentence;
            //each opinion element can contain one or more opinions
            Element opinionsElement = sent.getChild("Opinions");
            if (opinionsElement != null) {
                for (Element opinion : opinionsElement.getChildren()) {
                    String category = opinion.getAttributeValue("category");
                    String targetString = opinion.getAttributeValue("target");
                    System.err.println("-> " + sentId + " | " + category + ", " + targetString);
                    //adding OTE
                    if (!targetString.equalsIgnoreCase("NULL")) {
                        if (!removeClass.contains(category)) {
                            int fromOffset = Integer.parseInt(opinion.getAttributeValue("from"));
                            int toOffset = Integer.parseInt(opinion.getAttributeValue("to"));
                            int startIndex = -1;
                            int endIndex = -1;
                            for (int i = 0; i < wfFromOffsets.size(); i++) {
                                if (wfFromOffsets.get(i) == fromOffset) {
                                    startIndex = i;
                                }
                            }
                            for (int i = 0; i < wfToOffsets.size(); i++) {
                                if (wfToOffsets.get(i) == toOffset) {
                                    //span is +1 with respect to the last token of the span
                                    endIndex = i + 1;
                                }
                            }
                            //guard against targets whose offsets do not match any token
                            //boundary; previously this fell through to copyOfRange with
                            //-1 and crashed with an uncaught ArrayIndexOutOfBoundsException
                            if (startIndex == -1 || endIndex == -1) {
                                System.err.println(
                                        "Could not align target offsets in sentence " + sentId);
                                continue;
                            }
                            List<String> wfIds = Arrays
                                    .asList(Arrays.copyOfRange(tokenIds, startIndex, endIndex));
                            List<String> wfTermIds = getWFIdsFromTerms(sentTerms);
                            if (checkTermsRefsIntegrity(wfIds, wfTermIds)) {
                                List<Term> nameTerms = kaf.getTermsFromWFs(wfIds);
                                ixa.kaflib.Span<Term> neSpan = KAFDocument.newTermSpan(nameTerms);
                                List<ixa.kaflib.Span<Term>> references = new ArrayList<ixa.kaflib.Span<Term>>();
                                references.add(neSpan);
                                Entity neEntity = kaf.newEntity(references);
                                neEntity.setType(category);
                            }
                        }
                    } else {
                        //NULL target: pick the first unused token whose dictionary
                        //category matches this opinion's category
                        Dictionary dictionary;
                        //close the stream once the dictionary is loaded (it leaked before)
                        try (InputStream in = CmdLineUtil.openInFile(Paths.get(nullDict).toFile())) {
                            dictionary = new Dictionary(in);
                        }
                        int tokenIndex = -1;
                        List<Integer> startIndexs = new ArrayList<>();
                        for (WF wf : sentWFs) {
                            tokenIndex += 1;
                            String word = wf.getForm().toLowerCase();
                            String dictCategory = dictionary.lookup(word);
                            if (dictCategory != null && dictCategory.equalsIgnoreCase(category)
                                    && !usedTargets.contains(word)
                                    && !removeClass.contains(category)) {
                                startIndexs.add(tokenIndex);
                                break;
                            }
                        }
                        for (Integer index : startIndexs) {
                            List<String> wfIds = Arrays
                                    .asList(Arrays.copyOfRange(tokenIds, index, index + 1));
                            List<String> wfTermIds = getWFIdsFromTerms(sentTerms);
                            if (checkTermsRefsIntegrity(wfIds, wfTermIds)) {
                                List<Term> nameTerms = kaf.getTermsFromWFs(wfIds);
                                ixa.kaflib.Span<Term> neSpan = KAFDocument.newTermSpan(nameTerms);
                                List<ixa.kaflib.Span<Term>> references = new ArrayList<ixa.kaflib.Span<Term>>();
                                references.add(neSpan);
                                Entity neEntity = kaf.newEntity(references);
                                neEntity.setType(category);
                            }
                        }

                    }
                }
            }
        } //end of sentence
    } catch (JDOMException | IOException e) {
        e.printStackTrace();
    }
}

From source file:eus.ixa.ixa.pipe.convert.AbsaSemEval.java

/**
 * Converts ABSA 2015 opinion annotations from an XML file into NAF NER entities,
 * labelling every entity with the generic type "TARGET". Opinions with an explicit
 * target are aligned to the tokenized text through their character offsets; opinions
 * with a NULL target fall back to a dictionary lookup over the sentence tokens,
 * skipping tokens already used as explicit targets.
 *
 * @param kaf the NAF document to which word forms, terms and entities are added
 * @param fileName the path of the ABSA XML file to read
 * @param language the language passed to the tokenizer
 * @param nullDict the path of the word-to-category dictionary used for NULL targets
 */
private static void absa2015TargetNullToNAFNER_TARGET(KAFDocument kaf, String fileName, String language,
        String nullDict) {
    //reading the ABSA xml file
    SAXBuilder sax = new SAXBuilder();
    XPathFactory xFactory = XPathFactory.instance();
    try {
        Document doc = sax.build(fileName);
        XPathExpression<Element> expr = xFactory.compile("//sentence", Filters.element());
        List<Element> sentences = expr.evaluate(doc);

        //collect every token (lowercased) appearing in some explicit opinion target,
        //so the NULL-target fallback never re-uses them
        List<String> usedTargets = new ArrayList<>();
        for (Element sent : sentences) {
            Element opinionsElement = sent.getChild("Opinions");
            if (opinionsElement != null) {
                for (Element opinion : opinionsElement.getChildren()) {
                    String targetString = opinion.getAttributeValue("target");
                    for (String str : targetString.split("\\s+")) {
                        if (!usedTargets.contains(str.toLowerCase())) {
                            usedTargets.add(str.toLowerCase());
                        }
                    }
                }
            }
        }

        //naf sentence counter
        int counter = 1;
        for (Element sent : sentences) {
            List<Integer> wfFromOffsets = new ArrayList<>();
            List<Integer> wfToOffsets = new ArrayList<>();
            List<WF> sentWFs = new ArrayList<>();
            List<Term> sentTerms = new ArrayList<>();
            //sentence id and original text
            String sentId = sent.getAttributeValue("id");
            String sentString = sent.getChildText("text");
            //the list contains just one list of tokens
            List<List<Token>> segmentedSentence = tokenizeSentence(sentString, language);
            for (List<Token> sentence : segmentedSentence) {
                for (Token token : sentence) {
                    WF wf = kaf.newWF(token.startOffset(), token.getTokenValue(), counter);
                    wf.setXpath(sentId);
                    final List<WF> wfTarget = new ArrayList<WF>();
                    wfTarget.add(wf);
                    wfFromOffsets.add(wf.getOffset());
                    wfToOffsets.add(wf.getOffset() + wf.getLength());
                    sentWFs.add(wf);
                    Term term = kaf.newTerm(KAFDocument.newWFSpan(wfTarget));
                    term.setPos("O");
                    term.setLemma(token.getTokenValue());
                    sentTerms.add(term);
                }
            }
            counter++;
            String[] tokenIds = new String[sentWFs.size()];
            for (int i = 0; i < sentWFs.size(); i++) {
                tokenIds[i] = sentWFs.get(i).getId();
            }
            //going through every opinion element for each sentence;
            //each opinion element can contain one or more opinions
            Element opinionsElement = sent.getChild("Opinions");
            if (opinionsElement != null) {
                for (Element opinion : opinionsElement.getChildren()) {
                    String category = opinion.getAttributeValue("category");
                    String targetString = opinion.getAttributeValue("target");
                    System.err.println("-> " + category + ", " + targetString);
                    //adding OTE
                    if (!targetString.equalsIgnoreCase("NULL")) {
                        int fromOffset = Integer.parseInt(opinion.getAttributeValue("from"));
                        int toOffset = Integer.parseInt(opinion.getAttributeValue("to"));
                        int startIndex = -1;
                        int endIndex = -1;
                        for (int i = 0; i < wfFromOffsets.size(); i++) {
                            if (wfFromOffsets.get(i) == fromOffset) {
                                startIndex = i;
                            }
                        }
                        for (int i = 0; i < wfToOffsets.size(); i++) {
                            if (wfToOffsets.get(i) == toOffset) {
                                //span is +1 with respect to the last token of the span
                                endIndex = i + 1;
                            }
                        }
                        //guard against targets whose offsets do not match any token
                        //boundary; previously this fell through to copyOfRange with
                        //-1 and crashed with an uncaught ArrayIndexOutOfBoundsException
                        if (startIndex == -1 || endIndex == -1) {
                            System.err.println(
                                    "Could not align target offsets in sentence " + sentId);
                            continue;
                        }
                        List<String> wfIds = Arrays.asList(Arrays.copyOfRange(tokenIds, startIndex, endIndex));
                        List<String> wfTermIds = getWFIdsFromTerms(sentTerms);
                        if (checkTermsRefsIntegrity(wfIds, wfTermIds)) {
                            List<Term> nameTerms = kaf.getTermsFromWFs(wfIds);
                            ixa.kaflib.Span<Term> neSpan = KAFDocument.newTermSpan(nameTerms);
                            List<ixa.kaflib.Span<Term>> references = new ArrayList<ixa.kaflib.Span<Term>>();
                            references.add(neSpan);
                            Entity neEntity = kaf.newEntity(references);
                            neEntity.setType("TARGET");
                        }
                    } else {
                        //NULL target: pick the first unused token whose dictionary
                        //category matches this opinion's category
                        Dictionary dictionary;
                        //close the stream once the dictionary is loaded (it leaked before)
                        try (InputStream in = CmdLineUtil.openInFile(Paths.get(nullDict).toFile())) {
                            dictionary = new Dictionary(in);
                        }
                        int tokenIndex = -1;
                        List<Integer> startIndexs = new ArrayList<>();
                        for (WF wf : sentWFs) {
                            tokenIndex += 1;
                            String word = wf.getForm().toLowerCase();
                            String dictCategory = dictionary.lookup(word);
                            if (dictCategory != null && dictCategory.equalsIgnoreCase(category)
                                    && !usedTargets.contains(word)) {
                                startIndexs.add(tokenIndex);
                                break;
                            }
                        }
                        for (Integer index : startIndexs) {
                            List<String> wfIds = Arrays
                                    .asList(Arrays.copyOfRange(tokenIds, index, index + 1));
                            List<String> wfTermIds = getWFIdsFromTerms(sentTerms);
                            if (checkTermsRefsIntegrity(wfIds, wfTermIds)) {
                                List<Term> nameTerms = kaf.getTermsFromWFs(wfIds);
                                ixa.kaflib.Span<Term> neSpan = KAFDocument.newTermSpan(nameTerms);
                                List<ixa.kaflib.Span<Term>> references = new ArrayList<ixa.kaflib.Span<Term>>();
                                references.add(neSpan);
                                Entity neEntity = kaf.newEntity(references);
                                neEntity.setType("TARGET");
                            }
                        }

                    }
                }
            }
        } //end of sentence
    } catch (JDOMException | IOException e) {
        e.printStackTrace();
    }
}

From source file:es.ehu.si.ixa.pipe.convert.Convert.java

/**
 * Filters out lines that are less than 90% lowercase.
 *
 * @param inFile
 *          the file containing the text, one sentence per line
 * @return the concatenation (newline-separated) of the lines whose non-space
 *         characters are at least 90% lowercase; empty and all-space lines are
 *         dropped
 * @throws IOException
 *           if the input file cannot be read
 */
private String brownCleanUpperCase(File inFile) throws IOException {
    StringBuilder precleantext = new StringBuilder();
    //try-with-resources: the reader (and underlying stream) leaked on exception before
    try (BufferedReader breader = new BufferedReader(
            new InputStreamReader(CmdLineUtil.openInFile(inFile), Charset.forName("UTF-8")))) {
        String line;
        while ((line = breader.readLine()) != null) {
            //count lowercase characters among the non-space characters; spaces are
            //ignored, matching the original behavior of joining the space-split
            //tokens before counting
            int totalChars = 0;
            int lowercaseChars = 0;
            for (char c : line.toCharArray()) {
                if (c != ' ') {
                    totalChars++;
                    if (Character.isLowerCase(c)) {
                        lowercaseChars++;
                    }
                }
            }
            //totalChars == 0 previously produced 0.0/0 == NaN, which silently failed
            //the comparison; blank lines are still skipped, now explicitly
            if (totalChars > 0 && lowercaseChars / (double) totalChars >= 0.90) {
                precleantext.append(line).append("\n");
            }
        }
    }
    return precleantext.toString();
}