Example usage for java.util LinkedList subList

List of usage examples for java.util LinkedList subList

Introduction

On this page you can find example usage for java.util LinkedList subList.

Prototype

List<E> subList(int fromIndex, int toIndex);

Document

Returns a view of the portion of this list between the specified fromIndex, inclusive, and toIndex, exclusive.
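
A minimal sketch of these semantics: the lower bound is inclusive, the upper bound is exclusive, and the returned list is a view backed by the original list.

import java.util.LinkedList;
import java.util.List;

public class SubListViewExample {
    public static void main(String[] args) {
        LinkedList<String> list = new LinkedList<String>();
        list.add("a");
        list.add("b");
        list.add("c");
        list.add("d");

        // fromIndex 1 is inclusive, toIndex 3 is exclusive -> [b, c]
        List<String> middle = list.subList(1, 3);
        System.out.println(middle);

        // the sublist is a view: clearing it removes that range from the backing list
        middle.clear();
        System.out.println(list); // [a, d]
    }
}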

Usage

From source file:elh.eus.absa.Features.java

/**
 * Extract n-grams up to a certain length from a CoNLL tabulated format string.
 *
 * @param input InputStream over the tagged CoNLL string
 * @param length which 'n' to use for the n-grams
 * @param type (wf|lemma|pos): what type of n-grams to extract
 * @param discardPos PoS tags whose tokens should be discarded
 * @param save whether to save the n-grams to a file or not
 * @return 1 if the process ended correctly
 */
private int extractNgramsTABString(InputStream input, int length, String type, List<String> discardPos,
        boolean save) {
    //System.err.println("ngram extraction Tab: _"+length+"_"+type);
    if (length == 0) {
        return 0;
    }

    //System.err.println("ngram extraction, corpus sentences: "+corpus.getSentences().get(sent));                 
    //String[] tokens = input.split("\n");
    BufferedReader reader = new BufferedReader(new InputStreamReader(input));
    LinkedList<String> ngrams = new LinkedList<String>();
    String line;
    try {
        while ((line = reader.readLine()) != null) {
            String ngram = "";
            String[] fields = line.split("\\s");
            String pos = "";
            switch (type) {
            case "wf":
                ngram = fields[0];
                break;
            case "lemma":
                if (fields.length > 1) {
                    ngram = fields[1];
                }
                if (fields.length > 2) {
                    pos = fields[2];
                }
                break;
            case "pos":
                if (fields.length > 2) {
                    ngram = fields[2];
                    switch (ngram.length()) {
                    case 0:
                        ngram = "-";
                        break;
                    case 1:
                        ngram = ngram.substring(0, 1);
                        break;
                    default:
                        ngram = ngram.substring(0, 2);
                        break;
                    }
                }
            }

            //if there is a blank line we assume the sentence has ended, so we empty and re-initialize the n-gram list 
            if (ngram.equals("")) {
                //empty n-gram list and add remaining n-grams to the feature list
                while (!ngrams.isEmpty()) {
                    String ng = featureFromArray(ngrams, type);
                    addNgram(type, ng);
                    ngrams.removeFirst();
                }
                continue;
            }

            if (ngrams.size() >= length) {
                ngrams.removeFirst();
            }

            //discard the element as an invalid ngram if no alphanumeric char is present, or if it has a PoS tag
            //that should be discarded
            String lCurrent = ngram;
            if ((!discardPos.contains(pos)) && (!ngram.matches("^[^\\p{L}\\p{M}\\p{Nd}\\p{InEmoticons}]+$"))
                    && (lCurrent.length() > 1)) {
                //standardize numeric values to the NUMNUM lemma value
                //ngram.replaceFirst("^[0-9]$", "NUMNUM");
                if (!type.equalsIgnoreCase("pos")) {
                    ngrams.add(normalize(ngram, params.getProperty("normalization", "none")));
                } else {
                    ngrams.add(ngram);
                }
            }
            //certain punctuation marks are allowed as lemmas
            else if ((lCurrent.length() < 2) && (lCurrent.matches("[,;.?!]"))) {
                ngrams.add(lCurrent);
            }

            // add ngrams to the feature list
            for (int i = 0; i < ngrams.size(); i++) {
                String ng = featureFromArray(ngrams.subList(0, i + 1), type);
                addNgram(type, ng);
            }
        }
    } catch (IOException e) {
        System.err.println("EliXa::Features::extractNgramsTABString - WARNING: Error reading tagged file, "
                + "ngram extraction may be only partial\n");
    }

    //empty ngram list and add remaining ngrams to the feature list
    while (!ngrams.isEmpty()) {
        String ng = featureFromArray(ngrams, type);
        addNgram(type, ng);
        ngrams.removeFirst();
    }

    return 1;
}
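
The inner for loop above is the subList pattern of interest here: the method keeps a bounded sliding window in a LinkedList and emits every prefix view ngrams.subList(0, i + 1) as an n-gram feature. A stripped-down sketch of that pattern, with the featureFromArray/addNgram calls replaced by a simple join-and-print and a made-up token list:

import java.util.Arrays;
import java.util.LinkedList;

public class PrefixNgramSketch {
    public static void main(String[] args) {
        int length = 3; // maximum n-gram length
        LinkedList<String> window = new LinkedList<String>();

        for (String token : Arrays.asList("the", "quick", "brown", "fox")) {
            if (window.size() >= length) {
                window.removeFirst(); // keep at most 'length' tokens in the window
            }
            window.add(token);

            // every prefix view of the current window is emitted as an n-gram
            for (int i = 0; i < window.size(); i++) {
                System.out.println(String.join(" ", window.subList(0, i + 1)));
            }
        }
    }
}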

From source file:elh.eus.absa.Features.java

/**
 * Fills the attribute vectors for the instances existing in the given corpus.
 * Attribute vectors contain the features loaded by the creatFeatureSet() function.
 *
 * @param save whether the Instances should be saved to an arff file or not
 * @param prefix prefix used in the name of the saved arff file
 * @return Weka Instances object containing the attribute vectors filled with the features specified
 *         in the parameter file.
 */
public Instances loadInstances(boolean save, String prefix) throws IOException {
    String savePath = params.getProperty("fVectorDir") + File.separator + "arff" + File.separator + "train_"
            + prefix;
    HashMap<String, Opinion> trainExamples = corpus.getOpinions();

    int trainExamplesNum = trainExamples.size();

    int bowWin = 0;
    if (params.containsKey("window")) {
        bowWin = Integer.parseInt(params.getProperty("window"));
        savePath = savePath + "_w" + bowWin;
    }

    //Properties posProp = new Properties();
    //eus.ixa.ixa.pipe.pos.Annotate postagger = new eus.ixa.ixa.pipe.pos.Annotate(posProp);      
    if (params.containsKey("lemmaNgrams")) {
        Properties posProp = NLPpipelineWrapper.setPostaggerProperties(params.getProperty("pos-model"),
                corpus.getLang(), "3", "bin", "false");

        postagger = new eus.ixa.ixa.pipe.pos.Annotate(posProp);
    }

    //System.out.println("train examples: "+trainExamplesNum);
    //Create the Weka object for the training set
    Instances rsltdata = new Instances("train", atts, trainExamplesNum);

    // setting class attribute (last attribute in train data).
    //traindata.setClassIndex(traindata.numAttributes() - 1);

    System.err.println("Features: loadInstances() - featNum: " + this.featNum + " - trainset attrib num -> "
            + rsltdata.numAttributes() + " - ");
    System.out.println("Features: loadInstances() - featNum: " + this.featNum + " - trainset attrib num -> "
            + rsltdata.numAttributes() + " - ");

    int instId = 1;
    // fill the vectors for each training example
    for (String oId : trainExamples.keySet()) {
        //System.err.println("sentence: "+ corpus.getOpinionSentence(o.getId()));

        //value vector
        double[] values = new double[featNum];

        // first element is the instanceId         
        values[rsltdata.attribute("instanceId").index()] = instId;

        // string normalization (emoticons, twitter grammar,...)
        String opNormalized = corpus.getOpinionSentence(oId);

        // compute uppercase ratio before normalization (if needed)      
        double upRatio = 0.0;
        if (params.getProperty("upperCaseRatio", "no").equalsIgnoreCase("yes")) {
            String upper = opNormalized.replaceAll("[\\p{Ll}]", "");
            upRatio = (double) upper.length() / (double) opNormalized.length();
            values[rsltdata.attribute("upperCaseRation").index()] = upRatio;
        }

        // string normalization (emoticons, twitter grammar,...)
        if ((params.containsKey("wfngrams") || params.containsKey("lemmaNgrams"))
                && (!params.getProperty("normalization", "none").equalsIgnoreCase("noEmot"))) {
            opNormalized = normalize(opNormalized, params.getProperty("normalization", "none"));
        }

        //process the current instance with the NLP pipeline in order to get token and lemma|pos features
        KAFDocument nafinst = new KAFDocument("", "");
        String nafname = trainExamples.get(oId).getsId().replace(':', '_');
        String nafDir = params.getProperty("kafDir");
        String nafPath = nafDir + File.separator + nafname + ".kaf";
        //counter for opinion sentence token number. Used for computing relative values of the features
        int tokNum = 1;
        try {
            if (params.containsKey("lemmaNgrams")) //(lemmaNgrams != null) && (!lemmaNgrams.isEmpty()))
            {
                if (FileUtilsElh.checkFile(nafPath)) {
                    nafinst = KAFDocument.createFromFile(new File(nafPath));
                } else {
                    nafinst = NLPpipelineWrapper.ixaPipesTokPos(opNormalized, corpus.getLang(),
                            params.getProperty("pos-model"), postagger);
                    Files.createDirectories(Paths.get(nafDir));
                    nafinst.save(nafPath);
                }
                tokNum = nafinst.getWFs().size();
                //System.err.println("Features::loadInstances - postagging opinion sentence ("+oId+") - "+corpus.getOpinionSentence(oId));
            } else {
                if (FileUtilsElh.checkFile(nafPath)) {
                    nafinst = KAFDocument.createFromFile(new File(nafPath));
                } else {
                    nafinst = NLPpipelineWrapper.ixaPipesTok(opNormalized, corpus.getLang());
                }
                tokNum = nafinst.getWFs().size();
                //System.err.println("Features::loadInstances - tokenizing opinion sentence ("+oId+") - "+corpus.getOpinionSentence(oId));

            }
        } catch (IOException | JDOMException e) {
            System.err.println("Features::loadInstances() - error when NLP processing the instance " + instId
                    + "|" + oId + ") for filling the attribute vector");
            e.printStackTrace();
            System.exit(5);
        }

        LinkedList<String> ngrams = new LinkedList<String>();
        int ngramDim;
        try {
            ngramDim = Integer.valueOf(params.getProperty("wfngrams"));
        } catch (Exception e) {
            ngramDim = 0;
        }

        boolean polNgrams = false;
        if (params.containsKey("polNgrams")) {
            polNgrams = params.getProperty("polNgrams").equalsIgnoreCase("yes");
        }

        List<WF> window = nafinst.getWFs();
        Integer end = corpus.getOpinion(oId).getTo();
        // apply the window only if it is active (>0) and the target is not null (to=0 means no target)
        if ((bowWin > 0) && (end > 0)) {
            Integer start = corpus.getOpinion(oId).getFrom();
            Integer to = window.size();
            Integer from = 0;
            end++;
            for (int i = 0; i < window.size(); i++) {
                WF wf = window.get(i);
                if ((wf.getOffset() == start) && (i >= bowWin)) {
                    from = i - bowWin;
                } else if (wf.getOffset() >= end) {
                    if (i + bowWin < window.size()) {
                        to = i + bowWin;
                    }
                    break;
                }
            }
            window = window.subList(from, to);
            //System.out.println("startTgt: "+start+" - from: "+from+" | endTrgt:"+(end-1)+" - to:"+to);
        }

        //System.out.println("Sentence: "+corpus.getOpinionSentence(oId)+" - target: "+corpus.getOpinion(oId).getTarget()+
        //      "\n window: from-> "+window.get(0).getForm()+" to-> "+window.get(window.size()-1)+" .\n");

        List<String> windowWFIds = new ArrayList<String>();

        // word form ngram related features
        for (WF wf : window) {
            windowWFIds.add(wf.getId());

            String wfStr = wf.getForm();
            if (params.containsKey("wfngrams") && ngramDim > 0) {
                if (!savePath.contains("_wf" + ngramDim)) {
                    savePath = savePath + "_wf" + ngramDim;
                }
                //if the current word form is in the ngram list activate the feature in the vector
                if (ngrams.size() >= ngramDim) {
                    ngrams.removeFirst();
                }
                ngrams.add(wfStr);

                // add ngrams to the feature vector
                checkNgramFeatures(ngrams, values, "wf", 1, false); //toknum

            }
            // Clark cluster info corresponding to the current word form
            if (params.containsKey("clark") && attributeSets.get("ClarkCl").containsKey(wfStr)) {
                if (!savePath.contains("_cl")) {
                    savePath = savePath + "_cl";
                }
                values[rsltdata.attribute("ClarkClId_" + attributeSets.get("ClarkCl").get(wfStr)).index()]++;
            }

            // Brown cluster info corresponding to the current word form
            if (params.containsKey("brown") && attributeSets.get("BrownCl").containsKey(wfStr)) {
                if (!savePath.contains("_br")) {
                    savePath = savePath + "_br";
                }
                values[rsltdata.attribute("BrownClId_" + attributeSets.get("BrownCl").get(wfStr)).index()]++;
            }

            // word2vec cluster info corresponding to the current word form
            if (params.containsKey("word2vec") && attributeSets.get("w2vCl").containsKey(wfStr)) {
                if (!savePath.contains("_w2v")) {
                    savePath = savePath + "_w2v";
                }
                values[rsltdata.attribute("w2vClId_" + attributeSets.get("w2vCl").get(wfStr)).index()]++;
            }

        }

        //empty ngram list and add remaining ngrams to the feature list
        checkNgramFeatures(ngrams, values, "wf", 1, true); //toknum

        // PoS tagger related attributes: lemmas and pos tags
        if (params.containsKey("lemmaNgrams")
                || (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0"))
                || params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) {
            ngrams = new LinkedList<String>();
            if (params.containsKey("lemmaNgrams")
                    && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) {
                ngramDim = Integer.valueOf(params.getProperty("lemmaNgrams"));
            } else {
                ngramDim = 3;
            }
            LinkedList<String> posNgrams = new LinkedList<String>();
            int posNgramDim = 0;
            if (params.containsKey("pos")) {
                posNgramDim = Integer.valueOf(params.getProperty("pos"));
            }

            for (Term t : nafinst.getTermsFromWFs(windowWFIds)) {
                //lemmas // && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))
                if ((params.containsKey("lemmaNgrams")) || params.containsKey("polarLexiconGeneral")
                        || params.containsKey("polarLexiconDomain")) {
                    if (!savePath.contains("_l" + ngramDim)) {
                        savePath = savePath + "_l" + ngramDim;
                    }

                    String lemma = t.getLemma();

                    if (ngrams.size() >= ngramDim) {
                        ngrams.removeFirst();
                    }
                    ngrams.add(lemma);

                    // add ngrams to the feature vector
                    for (int i = 0; i < ngrams.size(); i++) {
                        String ng = featureFromArray(ngrams.subList(0, i + 1), "lemma");
                        //if the current lemma is in the ngram list activate the feature in the vector
                        if (params.containsKey("lemmaNgrams")
                                && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) {
                            Attribute ngAtt = rsltdata.attribute(ng);
                            if (ngAtt != null) {
                                addNumericToFeatureVector(ng, values, 1); //tokNum                     
                            }
                        }

                        ng = featureFromArray(ngrams.subList(0, i + 1), "");
                        if (params.containsKey("polarLexiconGeneral")
                                || params.containsKey("polarLexiconDomain")) {
                            checkPolarityLexicons(ng, values, tokNum, polNgrams);
                        } //end polarity ngram checker
                    } //end ngram checking                                      
                }
                //pos tags
                if (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0")) {
                    if (!savePath.contains("_p")) {
                        savePath = savePath + "_p";
                    }

                    if (posNgrams.size() >= posNgramDim) {
                        posNgrams.removeFirst();
                    }
                    posNgrams.add(t.getPos());

                    // add ngrams to the feature vector
                    checkNgramFeatures(posNgrams, values, "pos", 1, false);
                }
            } //endFor

            //empty ngram list and add remaining ngrams to the feature list
            while (!ngrams.isEmpty()) {
                String ng = featureFromArray(ngrams, "lemma");

                //if the current lemma is in the ngram list activate the feature in the vector
                if (rsltdata.attribute(ng) != null) {
                    addNumericToFeatureVector(ng, values, 1); //tokNum
                }

                // polarity lexicons
                if (params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) {
                    checkPolarityLexicons(ng, values, tokNum, polNgrams);
                } //end polarity ngram checker

                ngrams.removeFirst();
            }

            //empty pos ngram list and add remaining pos ngrams to the feature list
            checkNgramFeatures(posNgrams, values, "pos", 1, true);

        }

        // add sentence length as a feature
        if (params.containsKey("sentenceLength")
                && (!params.getProperty("sentenceLength").equalsIgnoreCase("no"))) {
            values[rsltdata.attribute("sentenceLength").index()] = tokNum;
        }

        //create object for the current instance and associate it with the current train dataset.         
        Instance inst = new SparseInstance(1.0, values);
        inst.setDataset(rsltdata);

        // add category attribute values
        String cat = trainExamples.get(oId).getCategory();

        if (params.containsKey("categories") && params.getProperty("categories").compareTo("E&A") == 0) {
            if (cat.compareTo("NULL") == 0) {
                inst.setValue(rsltdata.attribute("entCat").index(), cat);
                inst.setValue(rsltdata.attribute("attCat").index(), cat);
            } else {
                String[] splitCat = cat.split("#");
                inst.setValue(rsltdata.attribute("entCat").index(), splitCat[0]);
                inst.setValue(rsltdata.attribute("attCat").index(), splitCat[1]);
            }

            //inst.setValue(attIndexes.get("entAttCat"), cat);
        } else if (params.containsKey("categories") && params.getProperty("categories").compareTo("E#A") == 0) {
            inst.setValue(rsltdata.attribute("entAttCat").index(), cat);
        }

        if (params.containsKey("polarity") && params.getProperty("polarity").compareTo("yes") == 0) {
            // add class value as a double (Weka stores all values as doubles )
            String pol = normalizePolarity(trainExamples.get(oId).getPolarity());
            //System.err.println("Features::loadInstances - pol "+pol+" for oid "+oId+" - text:"+corpus.getOpinionSentence(oId));
            if (pol != null && !pol.isEmpty()) {
                //System.err.println("polarity: _"+pol+"_");
                inst.setValue(rsltdata.attribute("polarityCat"), pol);
            } else {
                inst.setMissing(rsltdata.attribute("polarityCat"));
            }
        }

        //add instance to train data
        rsltdata.add(inst);

        //store opinion Id and instance Id
        this.opInst.put(oId, instId);
        instId++;
    }

    System.err.println("Features : loadInstances() - training data ready total number of examples -> "
            + trainExamplesNum + " - " + rsltdata.numInstances());

    if (save) {
        try {
            savePath = savePath + ".arff";
            System.err.println("arff written to: " + savePath);
            ArffSaver saver = new ArffSaver();

            saver.setInstances(rsltdata);

            saver.setFile(new File(savePath));
            saver.writeBatch();
        } catch (IOException e1) {
            e1.printStackTrace();
        } catch (Exception e2) {
            e2.printStackTrace();
        }
    }
    return rsltdata;
}
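
Among the several subList calls above, the windowing step is worth isolating: when a bag-of-words window is active, the token list is narrowed to bowWin tokens around the opinion target by reassigning window to window.subList(from, to), so no copy is made. A simplified sketch of that idea, assuming tokens are plain strings and the target is located by index rather than by character offset:

import java.util.Arrays;
import java.util.List;

public class TargetWindowSketch {
    public static void main(String[] args) {
        List<String> tokens = Arrays.asList("I", "really", "liked", "the", "battery", "life", "a", "lot");
        int targetIndex = 4; // "battery"
        int bowWin = 2;      // tokens kept on each side of the target

        int from = Math.max(0, targetIndex - bowWin);
        int to = Math.min(tokens.size(), targetIndex + bowWin + 1);

        // subList returns a view of the window; nothing is copied
        List<String> window = tokens.subList(from, to);
        System.out.println(window); // [liked, the, battery, life, a]
    }
}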

From source file:net.spfbl.spf.SPF.java

/**
 * Merges the SPF fixing lists.
 *
 * @param midleList list of the middle mechanisms.
 * @param errorList list of the mechanisms with syntax errors.
 */
private static void mergeMechanism(LinkedList<String> midleList, LinkedList<String> errorList) {
    while (!errorList.isEmpty()) {
        boolean fixed = false;
        if (errorList.size() > 1) {
            for (int index = 1; index < errorList.size(); index++) {
                String tokenFix = errorList.getFirst();
                for (String tokenError : errorList.subList(1, index + 1)) {
                    tokenFix += tokenError;
                }
                if (isMechanismMiddle(tokenFix)) {
                    midleList.add(tokenFix);
                    int k = 0;
                    while (k++ <= index) {
                        errorList.removeFirst();
                    }
                    fixed = true;
                    break;
                }
            }

        }
        if (!fixed) {
            // It was not possible to fix the error.
            midleList.add(errorList.removeFirst());
        }
    }
}
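
The repair loop concatenates the first broken token with successive elements taken from errorList.subList(1, index + 1) until the joined string passes isMechanismMiddle. A minimal sketch of the same idea with a hypothetical predicate standing in for isMechanismMiddle (here the joined token simply has to contain a ':'):

import java.util.LinkedList;

public class MergeSketch {
    public static void main(String[] args) {
        LinkedList<String> good = new LinkedList<String>();
        LinkedList<String> broken = new LinkedList<String>();
        broken.add("include");
        broken.add(":");
        broken.add("example.org");
        broken.add("~all");

        while (!broken.isEmpty()) {
            boolean fixed = false;
            for (int index = 1; index < broken.size(); index++) {
                String candidate = broken.getFirst();
                // glue the next 'index' tokens onto the first one
                for (String part : broken.subList(1, index + 1)) {
                    candidate += part;
                }
                if (candidate.contains(":")) { // stand-in for isMechanismMiddle()
                    good.add(candidate);
                    for (int k = 0; k <= index; k++) {
                        broken.removeFirst();
                    }
                    fixed = true;
                    break;
                }
            }
            if (!fixed) {
                good.add(broken.removeFirst());
            }
        }
        System.out.println(good); // [include:, example.org, ~all]
    }
}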

From source file:org.apache.hadoop.tracing.TraceAdmin.java

@Override
public int run(String argv[]) throws Exception {
    LinkedList<String> args = new LinkedList<String>();
    for (String arg : argv) {
        args.add(arg);
    }
    if (StringUtils.popOption("-h", args) || StringUtils.popOption("-help", args)) {
        usage();
        return 0;
    } else if (args.size() == 0) {
        usage();
        return 0;
    }
    String hostPort = StringUtils.popOptionWithArgument("-host", args);
    if (hostPort == null) {
        System.err.println("You must specify a host with -host.");
        return 1;
    }
    if (args.size() == 0) {
        System.err.println("You must specify an operation.");
        return 1;
    }
    RPC.setProtocolEngine(getConf(), TraceAdminProtocolPB.class, ProtobufRpcEngine.class);
    InetSocketAddress address = NetUtils.createSocketAddr(hostPort);
    UserGroupInformation ugi = UserGroupInformation.getCurrentUser();
    Class<?> xface = TraceAdminProtocolPB.class;
    proxy = (TraceAdminProtocolPB) RPC.getProxy(xface, RPC.getProtocolVersion(xface), address, ugi, getConf(),
            NetUtils.getDefaultSocketFactory(getConf()), 0);
    remote = new TraceAdminProtocolTranslatorPB(proxy);
    try {
        if (args.get(0).equals("-list")) {
            return listSpanReceivers(args.subList(1, args.size()));
        } else if (args.get(0).equals("-add")) {
            return addSpanReceiver(args.subList(1, args.size()));
        } else if (args.get(0).equals("-remove")) {
            return removeSpanReceiver(args.subList(1, args.size()));
        } else {
            System.err.println("Unrecognized tracing command: " + args.get(0));
            System.err.println("Use -help for help.");
            return 1;
        }
    } finally {
        remote.close();
    }
}
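
Here subList only strips the command token and forwards the rest of the argument list to the handler. The pattern in isolation, with a hypothetical handler in place of listSpanReceivers and friends:

import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;

public class ArgTailSketch {
    private static int list(List<String> options) {
        System.out.println("listing with options " + options);
        return 0;
    }

    public static void main(String[] argv) {
        LinkedList<String> args = new LinkedList<String>(Arrays.asList("-list", "-verbose"));
        if (args.get(0).equals("-list")) {
            // hand everything after the command token to the handler
            list(args.subList(1, args.size()));
        }
    }
}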

From source file:org.dataconservancy.ui.services.MockDcsConnector.java

/**
 * Performs a search.  The caller indicates the maximum number of results to return, and the offset within
 * the total number of results.
 * <p/>
 * Because there is no concrete search implementation backing the mock connector, the query semantics are
 * determined by parsing the query string, and the query itself is emulated. Currently, the following kinds
 * of searches are supported:
 * <ul>
 * <li>ancestry search - find entities that share a common ancestor</li>
 * <li>identity search - find an entity with a specific id</li>
 * <li>parent search - find child Deliverable Units of a parent DU</li>
 * </ul>
 *
 * @param query the query string
 * @param maxResults the maximum number of results to return
 * @param offset the offset within the total number of results
 * @return an iterator over the search results
 * @throws DcsConnectorFault
 */
public CountableIterator<DcsEntity> search(String query, int maxResults, int offset) throws DcsConnectorFault {

    LinkedList<DcsEntity> result = new LinkedList<DcsEntity>();

    // Grab id
    String archive_id = null;
    if (query.contains("parent:")) {
        archive_id = query.substring(query.indexOf("parent:") + "parent:".length() + 1, query.length() - 2);
    } else {
        int i = query.indexOf('\"');
        archive_id = query.substring(i + 1, query.indexOf('\"', i + 1));
    }

    // TODO unescape solr syntax correctly
    archive_id = archive_id.replace("\\", "");

    if (query.contains(" OR ") && !query.contains("former:")) {
        // Assume sip recreation search

        // Find all the ancestors of archive_id
        performAncestrySearch(result, archive_id);

        // Add the common ancestor itself.
        DcsEntity e = archiveUtil.getEntity(archive_id);
        if (!result.contains(e)) {
            result.add(e);
        }

    } else if (query.startsWith("id:")) {
        // Assume id search
        if (archiveUtil.getEntity(archive_id) != null) {
            result.add(archiveUtil.getEntity(archive_id));
        }
    } else if (query.startsWith("ancestry:")) {
        // Assume ancestry search
        performAncestrySearch(result, archive_id);
    } else if (query.contains("parent")) {
        performParentSearch(result, archive_id);
    } else if (query.contains("former:")) {
        // example query we're handling:
        // ((entityType:"DeliverableUnit" AND former:"ed64f0fc\-8201\-47c0\-bdc9\-024078aaefbc" AND type:"root"))
        // OR ((entityType:"DeliverableUnit" AND former:"ed64f0fc\-8201\-47c0\-bdc9\-024078aaefbc" AND type:"state"))
        // another example query:
        // ((entityType:"DeliverableUnit" AND former:"id\://mooo" AND type:"root")) OR ((entityType:"DeliverableUnit" AND former:"id\://mooo" AND type:"state"))
        // another example:
        // (entityType:"DeliverableUnit" AND former:"http\://localhost\:8080/item/8" AND type:"org.dataconservancy\:types\:DataItem")

        Pattern p = Pattern.compile("^.*former:(\\S*)\\s.*$");
        Matcher m = p.matcher(query);
        if (m.find()) {
            String former_ref = m.group(1);
            former_ref = stripQuotes(former_ref);
            performFormerSearch(result, former_ref);
        } else {
            throw new RuntimeException(
                    "Unable to parse value for the 'former:' parameter from query string '" + query + "'");
        }

        LinkedList<DcsEntity> culledResults = new LinkedList<DcsEntity>();

        p = Pattern.compile("type:(\\S*)");
        m = p.matcher(query);
        if (!m.find()) {
            culledResults.addAll(result);
        }

        m = p.matcher(query);

        while (m.find()) {
            String type = stripQuotes(m.group(1));

            Iterator<DcsEntity> itr = result.iterator();
            while (itr.hasNext()) {
                DcsEntity entity = itr.next();
                if (!(entity instanceof DcsDeliverableUnit)) {
                    culledResults.add(entity);
                    continue; // skip the cast below for non-DeliverableUnit entities
                }

                if (type.equals(((DcsDeliverableUnit) entity).getType())) {
                    culledResults.add(entity);
                }
            }
        }

        result = culledResults;
    } else {
        throw new UnsupportedOperationException("Search not handled: " + query);
    }

    if (offset > 0 && result.size() > 0) {
        result.subList(0, offset).clear();
    }

    if (maxResults > 0 && result.size() > maxResults) {
        result.subList(maxResults, result.size()).clear();
    }

    return new MockSearchIterator(result);
}
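
The two trimming calls at the end of the method are the pagination idiom: because subList is a view, calling clear() on it removes that range from the backing list, so the entries before the offset are dropped first and the result is then truncated to maxResults. A self-contained sketch with made-up values:

import java.util.Arrays;
import java.util.LinkedList;

public class PagingSketch {
    public static void main(String[] args) {
        LinkedList<String> result = new LinkedList<String>(
                Arrays.asList("e1", "e2", "e3", "e4", "e5", "e6"));
        int offset = 2;
        int maxResults = 3;

        // drop everything before the requested offset
        if (offset > 0 && result.size() > 0) {
            result.subList(0, offset).clear();
        }
        // then truncate to at most maxResults entries
        if (maxResults > 0 && result.size() > maxResults) {
            result.subList(maxResults, result.size()).clear();
        }
        System.out.println(result); // [e3, e4, e5]
    }
}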

From source file:org.trnltk.apps.morphology.contextless.parser.CachingMorphologicParserApp.java

@App("Parse sample TBMM Journal with bulk parse")
public void parseTbmmJournal_b0241h_withBulkParse() throws Exception {
    final File tokenizedFile = new File("core/src/test/resources/tokenizer/tbmm_b0241h_tokenized.txt");
    final List<String> lines = Files.readLines(tokenizedFile, Charsets.UTF_8);
    final LinkedList<String> words = new LinkedList<String>();
    final HashSet<String> uniqueWords = new HashSet<String>();
    for (String line : lines) {
        final ArrayList<String> strings = Lists
                .newArrayList(Splitter.on(" ").trimResults().omitEmptyStrings().split(line));
        words.addAll(strings);
        uniqueWords.addAll(strings);
    }

    final int initialL1CacheSize = uniqueWords.size();
    final int maxL1CacheSize = initialL1CacheSize;

    final MorphologicParserCache l1Cache = new LRUMorphologicParserCache(NUMBER_OF_THREADS, initialL1CacheSize,
            maxL1CacheSize);

    final ExecutorService pool = Executors.newFixedThreadPool(NUMBER_OF_THREADS);

    final MorphologicParser[] parsers = new MorphologicParser[NUMBER_OF_THREADS];
    for (int i = 0; i < parsers.length; i++) {
        parsers[i] = new CachingMorphologicParser(new TwoLevelMorphologicParserCache(BULK_SIZE, l1Cache),
                contextlessMorphologicParser, true);
    }

    final StopWatch stopWatch = new StopWatch();
    stopWatch.start();

    for (int i = 0; i < words.size(); i = i + BULK_SIZE) {
        final MorphologicParser parser = parsers[(i / BULK_SIZE) % NUMBER_OF_THREADS];
        int start = i;
        int end = i + BULK_SIZE < words.size() ? i + BULK_SIZE : words.size();
        final int wordIndex = i;

        final List<String> subWordList = words.subList(start, end);
        pool.execute(new BulkParseCommand(parser, subWordList, wordIndex, false));
    }

    pool.shutdown();
    while (!pool.isTerminated()) {
        System.out.println("Waiting pool to be terminated!");
        pool.awaitTermination(500, TimeUnit.MILLISECONDS);
    }

    stopWatch.stop();

    System.out.println("Total time :" + stopWatch.toString());
    System.out.println("Nr of tokens : " + words.size());
    System.out.println("Avg time : " + (stopWatch.getTime() * 1.0d) / (words.size() * 1.0d) + " ms");
}
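
The dispatch loop slices the word list into BULK_SIZE chunks with words.subList(start, end) and hands each view to a worker; this is safe here only because the backing list is not structurally modified afterwards (a later structural modification would invalidate the views). A simplified sketch of the chunking arithmetic, with the parser and thread pool replaced by a print:

import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;

public class ChunkSketch {
    public static void main(String[] args) {
        final int BULK_SIZE = 3;
        LinkedList<String> words = new LinkedList<String>(
                Arrays.asList("w1", "w2", "w3", "w4", "w5", "w6", "w7"));

        for (int i = 0; i < words.size(); i += BULK_SIZE) {
            int end = Math.min(i + BULK_SIZE, words.size());
            // each chunk is a view over the shared word list
            List<String> chunk = words.subList(i, end);
            System.out.println("chunk starting at " + i + ": " + chunk);
        }
    }
}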

From source file:org.trnltk.apps.morphology.contextless.parser.CachingMorphologicParserApp.java

@App("Parse all sample corpus. Does not do an offline analysis to add most frequent words to cache in advance.")
public void parse8MWords() throws Exception {
    /*
     Total time :0:07:29.799
     Nr of tokens : 18362187
     Avg time : 0.024495938310616267 ms
    */
    final Set<File> files = SampleFiles.oneMillionSentencesTokenizedFiles();

    final LinkedList<String> words = new LinkedList<String>();
    final HashSet<String> uniqueWords = new HashSet<String>();

    for (File tokenizedFile : files) {
        final List<String> lines = Files.readLines(tokenizedFile, Charsets.UTF_8);
        for (String line : lines) {
            final ArrayList<String> strings = Lists
                    .newArrayList(Splitter.on(" ").trimResults().omitEmptyStrings().split(line));
            words.addAll(strings);
            uniqueWords.addAll(strings);
        }
    }

    System.out.println("Number of words : " + words.size());
    System.out.println("Number of unique words : " + uniqueWords.size());
    System.out.println("======================");

    final MorphologicParserCache l1Cache = new LRUMorphologicParserCache(NUMBER_OF_THREADS,
            INITIAL_L1_CACHE_SIZE, MAX_L1_CACHE_SIZE);

    final ThreadPoolExecutor pool = (ThreadPoolExecutor) Executors.newFixedThreadPool(NUMBER_OF_THREADS);

    final MorphologicParser[] parsers = new MorphologicParser[NUMBER_OF_THREADS];
    for (int i = 0; i < parsers.length; i++) {
        parsers[i] = new CachingMorphologicParser(new TwoLevelMorphologicParserCache(BULK_SIZE, l1Cache),
                contextlessMorphologicParser, true);
    }

    final StopWatch stopWatch = new StopWatch();
    stopWatch.start();

    for (int i = 0; i < words.size(); i = i + BULK_SIZE) {
        final MorphologicParser parser = parsers[(i / BULK_SIZE) % NUMBER_OF_THREADS];
        int start = i;
        int end = i + BULK_SIZE < words.size() ? i + BULK_SIZE : words.size();
        final List<String> subWordList = words.subList(start, end);
        final int wordIndex = i;
        pool.execute(new BulkParseCommand(parser, subWordList, wordIndex, false));
    }

    pool.shutdown();
    while (!pool.isTerminated()) {
        System.out.println("Waiting pool to be terminated!");
        pool.awaitTermination(1000, TimeUnit.MILLISECONDS);
    }

    stopWatch.stop();

    System.out.println("Total time :" + stopWatch.toString());
    System.out.println("Nr of tokens : " + words.size());
    System.out.println("Avg time : " + (stopWatch.getTime() * 1.0d) / (words.size() * 1.0d) + " ms");
}

From source file:org.trnltk.tokenizer.TokenizationGraph.java

public boolean isAddSpace(TextBlockGroup leftTextBlockGroup, TextBlockGroup rightTextBlockGroup,
        LinkedList<TextBlock> textBlocks, int currentBlockIndex) throws MissingTokenizationRuleException {
    final TokenizationGraphEdge edge = getRule(leftTextBlockGroup, rightTextBlockGroup, textBlocks,
            currentBlockIndex);
    if (edge == null) {
        int startIndex = Math.max(0, currentBlockIndex - CONTEXT_LENGTH);
        int endIndex = Math.min(textBlocks.size(), currentBlockIndex + CONTEXT_LENGTH);
        final TextBlockGroup contextBlockGroup = new TextBlockGroup(textBlocks.subList(startIndex, endIndex));
        final String leftTextBlockGroupStr = leftTextBlockGroup.toString().replace("\n", "\\n")
                .replace("\r", "\\r").replace("\t", "\\t");
        final String rightTextBlockGroupStr = rightTextBlockGroup.toString().replace("\n", "\\n")
                .replace("\r", "\\r").replace("\t", "\\t");
        throw new MissingTokenizationRuleException(leftTextBlockGroup, rightTextBlockGroup,
                "No rule found for \n\tleft : " + leftTextBlockGroupStr + "\n\tright " + rightTextBlockGroupStr,
                contextBlockGroup);
    }

    return edge.isAddSpace();
}

From source file:org.trnltk.tokenizer.TokenizationGraph.java

private TokenizationGraphEdge getRule(TextBlockGroup leftTextBlockGroup, TextBlockGroup rightTextBlockGroup,
        LinkedList<TextBlock> textBlocks, int currentBlockIndex) {
    final TextBlockTypeGroup leftTextBlockTypeGroup = leftTextBlockGroup.getTextBlockTypeGroup();
    final TextBlockTypeGroup rightTextBlockTypeGroup = rightTextBlockGroup.getTextBlockTypeGroup();

    final TokenizationGraphNode sourceNode = this.nodeMap.get(leftTextBlockTypeGroup);
    if (sourceNode == null) {
        int startIndex = Math.max(0, currentBlockIndex - CONTEXT_LENGTH);
        int endIndex = Math.min(textBlocks.size(), currentBlockIndex + CONTEXT_LENGTH);
        final TextBlockGroup contextBlockGroup = new TextBlockGroup(textBlocks.subList(startIndex, endIndex));
        final String leftTextBlockGroupStr = leftTextBlockGroup.toString().replace("\n", "\\n")
                .replace("\r", "\\r").replace("\t", "\\t");
        final String rightTextBlockGroupStr = rightTextBlockGroup.toString().replace("\n", "\\n")
                .replace("\r", "\\r").replace("\t", "\\t");
        throw new MissingTokenizationRuleException(leftTextBlockGroup, rightTextBlockGroup,
                "No source node found \n\tleft : " + leftTextBlockGroupStr + "\n\tright "
                        + rightTextBlockGroupStr,
                contextBlockGroup);
    }

    final TokenizationGraphNode targetNode = this.nodeMap.get(rightTextBlockTypeGroup);
    if (targetNode == null) {
        int startIndex = Math.max(0, currentBlockIndex - CONTEXT_LENGTH);
        int endIndex = Math.min(textBlocks.size(), currentBlockIndex + CONTEXT_LENGTH);
        final TextBlockGroup contextBlockGroup = new TextBlockGroup(textBlocks.subList(startIndex, endIndex));
        final String leftTextBlockGroupStr = leftTextBlockGroup.toString().replace("\n", "\\n")
                .replace("\r", "\\r").replace("\t", "\\t");
        final String rightTextBlockGroupStr = rightTextBlockGroup.toString().replace("\n", "\\n")
                .replace("\r", "\\r").replace("\t", "\\t");
        throw new MissingTokenizationRuleException(leftTextBlockGroup, rightTextBlockGroup,
                "No target node found \n\tleft : " + leftTextBlockGroupStr + "\n\tright "
                        + rightTextBlockGroupStr,
                contextBlockGroup);
    }

    return sourceNode.getEdge(rightTextBlockTypeGroup);
}
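
Both methods build their debugging context the same way: the indices are clamped with Math.max/Math.min before calling subList, so the view never reaches outside the list near its edges. A small sketch of that clamping, assuming a context length of 2:

import java.util.Arrays;
import java.util.List;

public class ContextWindowSketch {
    private static final int CONTEXT_LENGTH = 2;

    static List<String> context(List<String> blocks, int currentIndex) {
        int start = Math.max(0, currentIndex - CONTEXT_LENGTH);
        int end = Math.min(blocks.size(), currentIndex + CONTEXT_LENGTH);
        // clamping keeps subList from throwing IndexOutOfBoundsException near the edges
        return blocks.subList(start, end);
    }

    public static void main(String[] args) {
        List<String> blocks = Arrays.asList("b0", "b1", "b2", "b3", "b4");
        System.out.println(context(blocks, 0)); // [b0, b1]
        System.out.println(context(blocks, 4)); // [b2, b3, b4]
    }
}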