List of usage examples for java.util.LinkedList.subList
List<E> subList(int fromIndex, int toIndex);
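Before the project examples below, here is a minimal, self-contained sketch (not taken from any of the listed projects) of the property all of them rely on: subList returns a view backed by the original list, so reads see the current contents and structural changes such as clear() write through to the parent list.

import java.util.LinkedList;
import java.util.List;

public class SubListBasics {
    public static void main(String[] args) {
        LinkedList<String> tokens = new LinkedList<String>();
        for (String t : new String[] { "a", "b", "c", "d", "e" }) {
            tokens.add(t);
        }

        // A view of the first three elements; no copying happens here.
        List<String> prefix = tokens.subList(0, 3);
        System.out.println(prefix);   // [a, b, c]

        // Mutations through the view are reflected in the backing LinkedList.
        prefix.clear();
        System.out.println(tokens);   // [d, e]

        // To keep an independent copy, wrap the view in a new list instead.
        List<String> copy = new LinkedList<String>(tokens.subList(0, 1));
        tokens.removeFirst();
        System.out.println(copy);     // [d]
    }
}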
From source file:elh.eus.absa.Features.java
/**
 * Extract n-grams up to a certain length from a Conll tabulated format string.
 *
 * @param InputStream input : input tagged conll stream
 * @param int length : which 'n' to use for 'n-grams'
 * @param String type (wf|lemma|pos): what type of ngrams we want to extract.
 * @param List<String> discardPos : PoS tags whose tokens should be discarded.
 * @param boolean save : save ngrams to file or not.
 * @return int success: return 1 if the process ended correctly
 */
private int extractNgramsTABString(InputStream input, int length, String type, List<String> discardPos,
        boolean save) {
    //System.err.println("ngram extraction Tab: _"+length+"_"+type);
    if (length == 0) {
        return 0;
    }

    //System.err.println("ngram extraction, corpus sentences: "+corpus.getSentences().get(sent));
    //String[] tokens = input.split("\n");
    BufferedReader reader = new BufferedReader(new InputStreamReader(input));
    LinkedList<String> ngrams = new LinkedList<String>();
    String line;
    try {
        while ((line = reader.readLine()) != null) {
            String ngram = "";
            String[] fields = line.split("\\s");
            String pos = "";

            switch (type) {
            case "wf":
                ngram = fields[0];
                break;
            case "lemma":
                if (fields.length > 1) {
                    ngram = fields[1];
                }
                if (fields.length > 2) {
                    pos = fields[2];
                }
                break;
            case "pos":
                if (fields.length > 2) {
                    ngram = fields[2];
                    switch (ngram.length()) {
                    case 0:
                        ngram = "-";
                        break;
                    case 1:
                        ngram = ngram.substring(0, 1);
                        break;
                    default:
                        ngram = ngram.substring(0, 2);
                        break;
                    }
                }
            }

            //if there is a blank line we assume the sentence has ended; empty and re-initialize the n-gram list
            if (ngram.equals("")) {
                //empty n-gram list and add remaining n-grams to the feature list
                while (!ngrams.isEmpty()) {
                    String ng = featureFromArray(ngrams, type);
                    addNgram(type, ng);
                    ngrams.removeFirst();
                }
                continue;
            }

            if (ngrams.size() >= length) {
                ngrams.removeFirst();
            }

            //if no alphanumeric char is present discard the element as an invalid ngram,
            //or if it has a PoS tag that should be discarded
            String lCurrent = ngram;
            if ((!discardPos.contains(pos))
                    && (!ngram.matches("^[^\\p{L}\\p{M}\\p{Nd}\\p{InEmoticons}]+$"))
                    && (lCurrent.length() > 1)) {
                //standardize numeric values to NUMNUM lemma value
                //ngram.replaceFirst("^[0-9]$", "NUMNUM");
                if (!type.equalsIgnoreCase("pos")) {
                    ngrams.add(normalize(ngram, params.getProperty("normalization", "none")));
                } else {
                    ngrams.add(ngram);
                }
            }
            //certain punctuation marks are allowed as lemmas
            else if ((lCurrent.length() < 2) && (lCurrent.matches("[,;.?!]"))) {
                ngrams.add(lCurrent);
            }

            // add ngrams to the feature list
            for (int i = 0; i < ngrams.size(); i++) {
                String ng = featureFromArray(ngrams.subList(0, i + 1), type);
                addNgram(type, ng);
            }
        }
    } catch (IOException e) {
        System.err.println("EliXa::Features::extractNgramsTABString - WARNING: Error reading tagged file, "
                + "ngram extraction may be only partial\n");
    }

    //empty ngram list and add remaining ngrams to the feature list
    while (!ngrams.isEmpty()) {
        String ng = featureFromArray(ngrams, type);
        addNgram(type, ng);
        ngrams.removeFirst();
    }

    return 1;
}
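The method above keeps a bounded LinkedList as a sliding window and calls ngrams.subList(0, i + 1) to emit every prefix (unigram, bigram, ...) ending at the current token. A stripped-down sketch of that pattern, with the project-specific featureFromArray and addNgram helpers replaced by a simple print:

import java.util.LinkedList;

public class NgramWindowSketch {
    public static void main(String[] args) {
        int length = 3; // maximum n-gram length
        LinkedList<String> window = new LinkedList<String>();

        for (String token : "the quick brown fox jumps".split(" ")) {
            if (window.size() >= length) {
                window.removeFirst(); // keep at most 'length' tokens
            }
            window.add(token);

            // every prefix of the window is an n-gram ending at 'token'
            for (int i = 0; i < window.size(); i++) {
                System.out.println(String.join(" ", window.subList(0, i + 1)));
            }
        }
    }
}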
From source file:elh.eus.absa.Features.java
/**
 * Fills the attribute vectors for the instances existing in the given corpus.
 * Attribute vectors contain the features loaded by the creatFeatureSet() function.
 *
 * @param boolean save : whether the Instances file should be saved to an arff file or not.
 * @return Weka Instances object containing the attribute vectors filled with the features specified
 *         in the parameter file.
 */
public Instances loadInstances(boolean save, String prefix) throws IOException {
    String savePath = params.getProperty("fVectorDir") + File.separator + "arff" + File.separator + "train_"
            + prefix;
    HashMap<String, Opinion> trainExamples = corpus.getOpinions();

    int trainExamplesNum = trainExamples.size();

    int bowWin = 0;
    if (params.containsKey("window")) {
        bowWin = Integer.parseInt(params.getProperty("window"));
        savePath = savePath + "_w" + bowWin;
    }

    //Properties posProp = new Properties();
    //eus.ixa.ixa.pipe.pos.Annotate postagger = new eus.ixa.ixa.pipe.pos.Annotate(posProp);
    if (params.containsKey("lemmaNgrams")) {
        Properties posProp = NLPpipelineWrapper.setPostaggerProperties(params.getProperty("pos-model"),
                corpus.getLang(), "3", "bin", "false");
        postagger = new eus.ixa.ixa.pipe.pos.Annotate(posProp);
    }

    //System.out.println("train examples: "+trainExamplesNum);
    //Create the Weka object for the training set
    Instances rsltdata = new Instances("train", atts, trainExamplesNum);

    // setting class attribute (last attribute in train data)
    //traindata.setClassIndex(traindata.numAttributes() - 1);

    System.err.println("Features: loadInstances() - featNum: " + this.featNum + " - trainset attrib num -> "
            + rsltdata.numAttributes() + " - ");
    System.out.println("Features: loadInstances() - featNum: " + this.featNum + " - trainset attrib num -> "
            + rsltdata.numAttributes() + " - ");

    int instId = 1;
    // fill the vectors for each training example
    for (String oId : trainExamples.keySet()) {
        //System.err.println("sentence: "+ corpus.getOpinionSentence(o.getId()));

        //value vector
        double[] values = new double[featNum];

        // first element is the instanceId
        values[rsltdata.attribute("instanceId").index()] = instId;

        // string normalization (emoticons, twitter grammar,...)
        String opNormalized = corpus.getOpinionSentence(oId);

        // compute uppercase ratio before normalization (if needed)
        double upRatio = 0.0;
        if (params.getProperty("upperCaseRatio", "no").equalsIgnoreCase("yes")) {
            String upper = opNormalized.replaceAll("[\\p{Ll}]", "");
            upRatio = (double) upper.length() / (double) opNormalized.length();
            values[rsltdata.attribute("upperCaseRation").index()] = upRatio;
        }

        // string normalization (emoticons, twitter grammar,...)
        if ((params.containsKey("wfngrams") || params.containsKey("lemmaNgrams"))
                && (!params.getProperty("normalization", "none").equalsIgnoreCase("noEmot"))) {
            opNormalized = normalize(opNormalized, params.getProperty("normalization", "none"));
        }

        //process the current instance with the NLP pipeline in order to get token and lemma|pos features
        KAFDocument nafinst = new KAFDocument("", "");
        String nafname = trainExamples.get(oId).getsId().replace(':', '_');
        String nafDir = params.getProperty("kafDir");
        String nafPath = nafDir + File.separator + nafname + ".kaf";

        //counter for opinion sentence token number. Used for computing relative values of the features
        int tokNum = 1;
        try {
            if (params.containsKey("lemmaNgrams")) //(lemmaNgrams != null) && (!lemmaNgrams.isEmpty()))
            {
                if (FileUtilsElh.checkFile(nafPath)) {
                    nafinst = KAFDocument.createFromFile(new File(nafPath));
                } else {
                    nafinst = NLPpipelineWrapper.ixaPipesTokPos(opNormalized, corpus.getLang(),
                            params.getProperty("pos-model"), postagger);
                    Files.createDirectories(Paths.get(nafDir));
                    nafinst.save(nafPath);
                }
                tokNum = nafinst.getWFs().size();
                //System.err.println("Features::loadInstances - postagging opinion sentence ("+oId+") - "+corpus.getOpinionSentence(oId));
            } else {
                if (FileUtilsElh.checkFile(nafPath)) {
                    nafinst = KAFDocument.createFromFile(new File(nafPath));
                } else {
                    nafinst = NLPpipelineWrapper.ixaPipesTok(opNormalized, corpus.getLang());
                }
                tokNum = nafinst.getWFs().size();
                //System.err.println("Features::loadInstances - tokenizing opinion sentence ("+oId+") - "+corpus.getOpinionSentence(oId));
            }
        } catch (IOException | JDOMException e) {
            System.err.println("Features::loadInstances() - error when NLP processing the instance " + instId
                    + "|" + oId + ") for filling the attribute vector");
            e.printStackTrace();
            System.exit(5);
        }

        LinkedList<String> ngrams = new LinkedList<String>();
        int ngramDim;
        try {
            ngramDim = Integer.valueOf(params.getProperty("wfngrams"));
        } catch (Exception e) {
            ngramDim = 0;
        }

        boolean polNgrams = false;
        if (params.containsKey("polNgrams")) {
            polNgrams = params.getProperty("polNgrams").equalsIgnoreCase("yes");
        }

        List<WF> window = nafinst.getWFs();
        Integer end = corpus.getOpinion(oId).getTo();

        // apply window if window active (>0) and if the target is not null (to=0)
        if ((bowWin > 0) && (end > 0)) {
            Integer start = corpus.getOpinion(oId).getFrom();
            Integer to = window.size();
            Integer from = 0;
            end++;
            for (int i = 0; i < window.size(); i++) {
                WF wf = window.get(i);
                if ((wf.getOffset() == start) && (i >= bowWin)) {
                    from = i - bowWin;
                } else if (wf.getOffset() >= end) {
                    if (i + bowWin < window.size()) {
                        to = i + bowWin;
                    }
                    break;
                }
            }
            window = window.subList(from, to);
            //System.out.println("startTgt: "+start+" - from: "+from+" | endTrgt:"+(end-1)+" - to:"+to);
        }

        //System.out.println("Sentence: "+corpus.getOpinionSentence(oId)+" - target: "+corpus.getOpinion(oId).getTarget()+
        //        "\n window: from-> "+window.get(0).getForm()+" to-> "+window.get(window.size()-1)+" .\n");

        List<String> windowWFIds = new ArrayList<String>();

        // word form ngram related features
        for (WF wf : window) {
            windowWFIds.add(wf.getId());

            String wfStr = wf.getForm();
            if (params.containsKey("wfngrams") && ngramDim > 0) {
                if (!savePath.contains("_wf" + ngramDim)) {
                    savePath = savePath + "_wf" + ngramDim;
                }
                //if the current word form is in the ngram list activate the feature in the vector
                if (ngrams.size() >= ngramDim) {
                    ngrams.removeFirst();
                }
                ngrams.add(wfStr);

                // add ngrams to the feature vector
                checkNgramFeatures(ngrams, values, "wf", 1, false); //toknum
            }

            // Clark cluster info corresponding to the current word form
            if (params.containsKey("clark") && attributeSets.get("ClarkCl").containsKey(wfStr)) {
                if (!savePath.contains("_cl")) {
                    savePath = savePath + "_cl";
                }
                values[rsltdata.attribute("ClarkClId_" + attributeSets.get("ClarkCl").get(wfStr)).index()]++;
            }

            // Brown cluster info corresponding to the current word form
            if (params.containsKey("brown") && attributeSets.get("BrownCl").containsKey(wfStr)) {
                if (!savePath.contains("_br")) {
                    savePath = savePath + "_br";
                }
                values[rsltdata.attribute("BrownClId_" + attributeSets.get("BrownCl").get(wfStr)).index()]++;
            }

            // word2vec cluster info corresponding to the current word form
            if (params.containsKey("word2vec") && attributeSets.get("w2vCl").containsKey(wfStr)) {
                if (!savePath.contains("_w2v")) {
                    savePath = savePath + "_w2v";
                }
                values[rsltdata.attribute("w2vClId_" + attributeSets.get("w2vCl").get(wfStr)).index()]++;
            }
        }

        //empty ngram list and add remaining ngrams to the feature list
        checkNgramFeatures(ngrams, values, "wf", 1, true); //toknum

        // PoS tagger related attributes: lemmas and pos tags
        if (params.containsKey("lemmaNgrams")
                || (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0"))
                || params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) {
            ngrams = new LinkedList<String>();
            if (params.containsKey("lemmaNgrams") && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) {
                ngramDim = Integer.valueOf(params.getProperty("lemmaNgrams"));
            } else {
                ngramDim = 3;
            }
            LinkedList<String> posNgrams = new LinkedList<String>();
            int posNgramDim = 0;
            if (params.containsKey("pos")) {
                posNgramDim = Integer.valueOf(params.getProperty("pos"));
            }

            for (Term t : nafinst.getTermsFromWFs(windowWFIds)) {
                //lemmas // && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))
                if ((params.containsKey("lemmaNgrams")) || params.containsKey("polarLexiconGeneral")
                        || params.containsKey("polarLexiconDomain")) {
                    if (!savePath.contains("_l" + ngramDim)) {
                        savePath = savePath + "_l" + ngramDim;
                    }

                    String lemma = t.getLemma();

                    if (ngrams.size() >= ngramDim) {
                        ngrams.removeFirst();
                    }
                    ngrams.add(lemma);

                    // add ngrams to the feature vector
                    for (int i = 0; i < ngrams.size(); i++) {
                        String ng = featureFromArray(ngrams.subList(0, i + 1), "lemma");
                        //if the current lemma is in the ngram list activate the feature in the vector
                        if (params.containsKey("lemmaNgrams")
                                && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) {
                            Attribute ngAtt = rsltdata.attribute(ng);
                            if (ngAtt != null) {
                                addNumericToFeatureVector(ng, values, 1); //tokNum
                            }
                        }

                        ng = featureFromArray(ngrams.subList(0, i + 1), "");
                        if (params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) {
                            checkPolarityLexicons(ng, values, tokNum, polNgrams);
                        } //end polarity ngram checker
                    } //end ngram checking
                }

                //pos tags
                if (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0")) {
                    if (!savePath.contains("_p")) {
                        savePath = savePath + "_p";
                    }

                    if (posNgrams.size() >= posNgramDim) {
                        posNgrams.removeFirst();
                    }
                    posNgrams.add(t.getPos());

                    // add ngrams to the feature vector
                    checkNgramFeatures(posNgrams, values, "pos", 1, false);
                }
            } //endFor

            //empty ngram list and add remaining ngrams to the feature list
            while (!ngrams.isEmpty()) {
                String ng = featureFromArray(ngrams, "lemma");

                //if the current lemma is in the ngram list activate the feature in the vector
                if (rsltdata.attribute(ng) != null) {
                    addNumericToFeatureVector(ng, values, 1); //tokNum
                }

                // polarity lexicons
                if (params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) {
                    checkPolarityLexicons(ng, values, tokNum, polNgrams);
                } //end polarity ngram checker

                ngrams.removeFirst();
            }

            //empty pos ngram list and add remaining pos ngrams to the feature list
            checkNgramFeatures(posNgrams, values, "pos", 1, true);
        }

        // add sentence length as a feature
        if (params.containsKey("sentenceLength")
                && (!params.getProperty("sentenceLength").equalsIgnoreCase("no"))) {
            values[rsltdata.attribute("sentenceLength").index()] = tokNum;
        }

        //create an object for the current instance and associate it with the current train dataset.
        Instance inst = new SparseInstance(1.0, values);
        inst.setDataset(rsltdata);

        // add category attribute values
        String cat = trainExamples.get(oId).getCategory();
        if (params.containsKey("categories") && params.getProperty("categories").compareTo("E&A") == 0) {
            if (cat.compareTo("NULL") == 0) {
                inst.setValue(rsltdata.attribute("entCat").index(), cat);
                inst.setValue(rsltdata.attribute("attCat").index(), cat);
            } else {
                String[] splitCat = cat.split("#");
                inst.setValue(rsltdata.attribute("entCat").index(), splitCat[0]);
                inst.setValue(rsltdata.attribute("attCat").index(), splitCat[1]);
            }
            //inst.setValue(attIndexes.get("entAttCat"), cat);
        } else if (params.containsKey("categories") && params.getProperty("categories").compareTo("E#A") == 0) {
            inst.setValue(rsltdata.attribute("entAttCat").index(), cat);
        }

        if (params.containsKey("polarity") && params.getProperty("polarity").compareTo("yes") == 0) {
            // add class value as a double (Weka stores all values as doubles)
            String pol = normalizePolarity(trainExamples.get(oId).getPolarity());
            //System.err.println("Features::loadInstances - pol "+pol+" for oid "+oId+" - text:"+corpus.getOpinionSentence(oId));
            if (pol != null && !pol.isEmpty()) {
                //System.err.println("polarity: _"+pol+"_");
                inst.setValue(rsltdata.attribute("polarityCat"), pol);
            } else {
                inst.setMissing(rsltdata.attribute("polarityCat"));
            }
        }

        //add instance to train data
        rsltdata.add(inst);

        //store opinion Id and instance Id
        this.opInst.put(oId, instId);
        instId++;
    }

    System.err.println("Features : loadInstances() - training data ready total number of examples -> "
            + trainExamplesNum + " - " + rsltdata.numInstances());

    if (save) {
        try {
            savePath = savePath + ".arff";
            System.err.println("arff written to: " + savePath);
            ArffSaver saver = new ArffSaver();
            saver.setInstances(rsltdata);
            saver.setFile(new File(savePath));
            saver.writeBatch();
        } catch (IOException e1) {
            e1.printStackTrace();
        } catch (Exception e2) {
            e2.printStackTrace();
        }
    }

    return rsltdata;
}
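In loadInstances the token window is narrowed with window = window.subList(from, to) once the offsets around the opinion target have been located. A reduced sketch of that clamped-window idea on plain strings (the KAF/NAF types and offset bookkeeping from the project are left out):

import java.util.Arrays;
import java.util.List;

public class TargetWindowSketch {
    public static void main(String[] args) {
        List<String> sentence = Arrays.asList("the", "battery", "life", "is", "really", "poor", "overall");
        int targetIndex = 5;   // position of the opinion target ("poor")
        int bowWin = 2;        // tokens to keep on each side

        int from = Math.max(0, targetIndex - bowWin);
        int to = Math.min(sentence.size(), targetIndex + bowWin + 1);

        // a view of the tokens around the target; only these feed the bag-of-words features
        List<String> window = sentence.subList(from, to);
        System.out.println(window); // [is, really, poor, overall]
    }
}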
From source file:net.spfbl.spf.SPF.java
/**
 * Merges the SPF mechanism fix lists.
 *
 * @param midleList list of the middle mechanisms.
 * @param errorList list of the mechanisms with syntax errors.
 */
private static void mergeMechanism(LinkedList<String> midleList, LinkedList<String> errorList) {
    while (!errorList.isEmpty()) {
        boolean fixed = false;
        if (errorList.size() > 1) {
            for (int index = 1; index < errorList.size(); index++) {
                String tokenFix = errorList.getFirst();
                for (String tokenError : errorList.subList(1, index + 1)) {
                    tokenFix += tokenError;
                }
                if (isMechanismMiddle(tokenFix)) {
                    midleList.add(tokenFix);
                    int k = 0;
                    while (k++ <= index) {
                        errorList.removeFirst();
                    }
                    fixed = true;
                    break;
                }
            }
        }
        if (!fixed) {
            // Could not fix the error.
            midleList.add(errorList.removeFirst());
        }
    }
}
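Here errorList.subList(1, index + 1) gives the run of tokens immediately after the first one, so progressively longer concatenations can be tried until one forms a valid mechanism. A small sketch of that "grow the candidate from a sublist" idea, with the SPF-specific isMechanismMiddle check replaced by a hypothetical predicate:

import java.util.LinkedList;

public class JoinBrokenTokensSketch {
    // hypothetical stand-in for the project's isMechanismMiddle(...) check
    static boolean looksValid(String token) {
        return token.matches("ip4:\\d+(\\.\\d+){3}/\\d+");
    }

    public static void main(String[] args) {
        LinkedList<String> broken = new LinkedList<String>();
        broken.add("ip4:10.0.0.0");
        broken.add("/");
        broken.add("8");

        for (int index = 1; index < broken.size(); index++) {
            String candidate = broken.getFirst();
            // concatenate the tokens right after the first one, one more each pass
            for (String piece : broken.subList(1, index + 1)) {
                candidate += piece;
            }
            if (looksValid(candidate)) {
                System.out.println("fixed: " + candidate); // fixed: ip4:10.0.0.0/8
                break;
            }
        }
    }
}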
From source file:org.apache.hadoop.tracing.TraceAdmin.java
@Override
public int run(String argv[]) throws Exception {
    LinkedList<String> args = new LinkedList<String>();
    for (String arg : argv) {
        args.add(arg);
    }
    if (StringUtils.popOption("-h", args) || StringUtils.popOption("-help", args)) {
        usage();
        return 0;
    } else if (args.size() == 0) {
        usage();
        return 0;
    }
    String hostPort = StringUtils.popOptionWithArgument("-host", args);
    if (hostPort == null) {
        System.err.println("You must specify a host with -host.");
        return 1;
    }
    if (args.size() == 0) {
        System.err.println("You must specify an operation.");
        return 1;
    }
    RPC.setProtocolEngine(getConf(), TraceAdminProtocolPB.class, ProtobufRpcEngine.class);
    InetSocketAddress address = NetUtils.createSocketAddr(hostPort);
    UserGroupInformation ugi = UserGroupInformation.getCurrentUser();
    Class<?> xface = TraceAdminProtocolPB.class;
    proxy = (TraceAdminProtocolPB) RPC.getProxy(xface, RPC.getProtocolVersion(xface), address, ugi, getConf(),
            NetUtils.getDefaultSocketFactory(getConf()), 0);
    remote = new TraceAdminProtocolTranslatorPB(proxy);
    try {
        if (args.get(0).equals("-list")) {
            return listSpanReceivers(args.subList(1, args.size()));
        } else if (args.get(0).equals("-add")) {
            return addSpanReceiver(args.subList(1, args.size()));
        } else if (args.get(0).equals("-remove")) {
            return removeSpanReceiver(args.subList(1, args.size()));
        } else {
            System.err.println("Unrecognized tracing command: " + args.get(0));
            System.err.println("Use -help for help.");
            return 1;
        }
    } finally {
        remote.close();
    }
}
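After the command name is examined, the remaining arguments are handed to the handler as args.subList(1, args.size()), avoiding a copy of the tail. A minimal sketch of that dispatch pattern (the handler below is a hypothetical placeholder, not the Hadoop API):

import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;

public class TailDispatchSketch {
    // hypothetical handler standing in for listSpanReceivers/addSpanReceiver/removeSpanReceiver
    static int handleList(List<String> options) {
        System.out.println("list called with options " + options);
        return 0;
    }

    public static void main(String[] argv) {
        LinkedList<String> args = new LinkedList<String>(Arrays.asList("-list", "-verbose", "foo"));

        if (args.isEmpty()) {
            System.err.println("You must specify an operation.");
            return;
        }
        if (args.get(0).equals("-list")) {
            // pass everything after the command name as a view, without copying
            handleList(args.subList(1, args.size()));
        } else {
            System.err.println("Unrecognized command: " + args.get(0));
        }
    }
}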
From source file:org.dataconservancy.ui.services.MockDcsConnector.java
/**
 * Performs a search. The caller indicates the maximum number of results to return, and the offset within
 * the total number of results.
 * <p/>
 * Because there is no concrete search implementation backing the mock connector, the query semantics are
 * determined by parsing the query string, and the query itself is emulated. Currently, the following kinds
 * of searches are supported:
 * <ul>
 * <li>ancestry search - find entities that share a common ancestor</li>
 * <li>identity search - find an entity with a specific id</li>
 * <li>parent search - find child Deliverable Units of a parent DU</li>
 * </ul>
 *
 * @param query the query string
 * @return an iterator over the search results
 * @throws DcsConnectorFault
 */
public CountableIterator<DcsEntity> search(String query, int maxResults, int offset) throws DcsConnectorFault {
    LinkedList<DcsEntity> result = new LinkedList<DcsEntity>();

    // Grab id
    String archive_id = null;

    if (query.contains("parent:")) {
        archive_id = query.substring(query.indexOf("parent:") + "parent:".length() + 1, query.length() - 2);
    } else {
        int i = query.indexOf('\"');
        archive_id = query.substring(i + 1, query.indexOf('\"', i + 1));
    }

    // TODO unescape solr syntax correctly
    archive_id = archive_id.replace("\\", "");

    if (query.contains(" OR ") && !query.contains("former:")) {
        // Assume sip recreation search

        // Find all the ancestors of archive_id
        performAncestrySearch(result, archive_id);

        // Add the common ancestor itself.
        DcsEntity e = archiveUtil.getEntity(archive_id);
        if (!result.contains(e)) {
            result.add(e);
        }
    } else if (query.startsWith("id:")) {
        // Assume id search
        if (archiveUtil.getEntity(archive_id) != null) {
            result.add(archiveUtil.getEntity(archive_id));
        }
    } else if (query.startsWith("ancestry:")) {
        // Assume ancestry search
        performAncestrySearch(result, archive_id);
    } else if (query.contains("parent")) {
        performParentSearch(result, archive_id);
    } else if (query.contains("former:")) {
        // example query we're handling:
        // ((entityType:"DeliverableUnit" AND former:"ed64f0fc\-8201\-47c0\-bdc9\-024078aaefbc" AND type:"root"))
        //   OR ((entityType:"DeliverableUnit" AND former:"ed64f0fc\-8201\-47c0\-bdc9\-024078aaefbc" AND type:"state"))
        // another example query:
        // ((entityType:"DeliverableUnit" AND former:"id\://mooo" AND type:"root")) OR ((entityType:"DeliverableUnit" AND former:"id\://mooo" AND type:"state"))
        // another example:
        // (entityType:"DeliverableUnit" AND former:"http\://localhost\:8080/item/8" AND type:"org.dataconservancy\:types\:DataItem")
        Pattern p = Pattern.compile("^.*former:(\\S*)\\s.*$");
        Matcher m = p.matcher(query);

        if (m.find()) {
            String former_ref = m.group(1);
            former_ref = stripQuotes(former_ref);
            performFormerSearch(result, former_ref);
        } else {
            throw new RuntimeException(
                    "Unable to parse value for the 'former:' parameter from query string '" + query + "'");
        }

        LinkedList<DcsEntity> culledResults = new LinkedList<DcsEntity>();

        p = Pattern.compile("type:(\\S*)");
        m = p.matcher(query);

        if (!m.find()) {
            culledResults.addAll(result);
        }

        m = p.matcher(query);
        while (m.find()) {
            String type = stripQuotes(m.group(1));
            Iterator<DcsEntity> itr = result.iterator();
            while (itr.hasNext()) {
                DcsEntity entity = itr.next();
                if (!(entity instanceof DcsDeliverableUnit)) {
                    culledResults.add(entity);
                } else if (type.equals(((DcsDeliverableUnit) entity).getType())) {
                    culledResults.add(entity);
                }
            }
        }

        result = culledResults;
    } else {
        throw new UnsupportedOperationException("Search not handled: " + query);
    }

    if (offset > 0 && result.size() > 0) {
        result.subList(0, offset).clear();
    }

    if (maxResults > 0 && result.size() > maxResults) {
        result.subList(maxResults, result.size()).clear();
    }

    return new MockSearchIterator(result);
}
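The offset/limit handling at the end of search() is the idiomatic way to trim a list in place: because subList is a view, clearing it removes that range from the backing list. A short sketch of the same paging logic in isolation:

import java.util.LinkedList;

public class PagingSketch {
    public static void main(String[] args) {
        LinkedList<Integer> result = new LinkedList<Integer>();
        for (int i = 1; i <= 10; i++) {
            result.add(i);
        }

        int offset = 3;
        int maxResults = 4;

        // drop everything before the requested offset
        if (offset > 0 && result.size() > 0) {
            result.subList(0, offset).clear();
        }
        // keep at most maxResults entries
        if (maxResults > 0 && result.size() > maxResults) {
            result.subList(maxResults, result.size()).clear();
        }

        System.out.println(result); // [4, 5, 6, 7]
    }
}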
From source file:org.trnltk.apps.morphology.contextless.parser.CachingMorphologicParserApp.java
@App("Parse sample TBMM Journal with bulk parse") public void parseTbmmJournal_b0241h_withBulkParse() throws Exception { final File tokenizedFile = new File("core/src/test/resources/tokenizer/tbmm_b0241h_tokenized.txt"); final List<String> lines = Files.readLines(tokenizedFile, Charsets.UTF_8); final LinkedList<String> words = new LinkedList<String>(); final HashSet<String> uniqueWords = new HashSet<String>(); for (String line : lines) { final ArrayList<String> strings = Lists .newArrayList(Splitter.on(" ").trimResults().omitEmptyStrings().split(line)); words.addAll(strings);//from w w w .j a v a2 s .c om uniqueWords.addAll(strings); } final int initialL1CacheSize = uniqueWords.size(); final int maxL1CacheSize = initialL1CacheSize; final MorphologicParserCache l1Cache = new LRUMorphologicParserCache(NUMBER_OF_THREADS, initialL1CacheSize, maxL1CacheSize); final ExecutorService pool = Executors.newFixedThreadPool(NUMBER_OF_THREADS); final MorphologicParser[] parsers = new MorphologicParser[NUMBER_OF_THREADS]; for (int i = 0; i < parsers.length; i++) { parsers[i] = new CachingMorphologicParser(new TwoLevelMorphologicParserCache(BULK_SIZE, l1Cache), contextlessMorphologicParser, true); } final StopWatch stopWatch = new StopWatch(); stopWatch.start(); for (int i = 0; i < words.size(); i = i + BULK_SIZE) { final MorphologicParser parser = parsers[(i / BULK_SIZE) % NUMBER_OF_THREADS]; int start = i; int end = i + BULK_SIZE < words.size() ? i + BULK_SIZE : words.size(); final int wordIndex = i; final List<String> subWordList = words.subList(start, end); pool.execute(new BulkParseCommand(parser, subWordList, wordIndex, false)); } pool.shutdown(); while (!pool.isTerminated()) { System.out.println("Waiting pool to be terminated!"); pool.awaitTermination(500, TimeUnit.MILLISECONDS); } stopWatch.stop(); System.out.println("Total time :" + stopWatch.toString()); System.out.println("Nr of tokens : " + words.size()); System.out.println("Avg time : " + (stopWatch.getTime() * 1.0d) / (words.size() * 1.0d) + " ms"); }
From source file:org.trnltk.apps.morphology.contextless.parser.CachingMorphologicParserApp.java
@App("Parse all sample corpus. Does not do an offline analysis to add most frequent words to cache in advance.") public void parse8MWords() throws Exception { /*/*from w ww. j a va 2 s . com*/ Total time :0:07:29.799 Nr of tokens : 18362187 Avg time : 0.024495938310616267 ms */ final Set<File> files = SampleFiles.oneMillionSentencesTokenizedFiles(); final LinkedList<String> words = new LinkedList<String>(); final HashSet<String> uniqueWords = new HashSet<String>(); for (File tokenizedFile : files) { final List<String> lines = Files.readLines(tokenizedFile, Charsets.UTF_8); for (String line : lines) { final ArrayList<String> strings = Lists .newArrayList(Splitter.on(" ").trimResults().omitEmptyStrings().split(line)); words.addAll(strings); uniqueWords.addAll(strings); } } System.out.println("Number of words : " + words.size()); System.out.println("Number of unique words : " + uniqueWords.size()); System.out.println("======================"); final MorphologicParserCache l1Cache = new LRUMorphologicParserCache(NUMBER_OF_THREADS, INITIAL_L1_CACHE_SIZE, MAX_L1_CACHE_SIZE); final ThreadPoolExecutor pool = (ThreadPoolExecutor) Executors.newFixedThreadPool(NUMBER_OF_THREADS); final MorphologicParser[] parsers = new MorphologicParser[NUMBER_OF_THREADS]; for (int i = 0; i < parsers.length; i++) { parsers[i] = new CachingMorphologicParser(new TwoLevelMorphologicParserCache(BULK_SIZE, l1Cache), contextlessMorphologicParser, true); } final StopWatch stopWatch = new StopWatch(); stopWatch.start(); for (int i = 0; i < words.size(); i = i + BULK_SIZE) { final MorphologicParser parser = parsers[(i / BULK_SIZE) % NUMBER_OF_THREADS]; int start = i; int end = i + BULK_SIZE < words.size() ? i + BULK_SIZE : words.size(); final List<String> subWordList = words.subList(start, end); final int wordIndex = i; pool.execute(new BulkParseCommand(parser, subWordList, wordIndex, false)); } pool.shutdown(); while (!pool.isTerminated()) { System.out.println("Waiting pool to be terminated!"); pool.awaitTermination(1000, TimeUnit.MILLISECONDS); } stopWatch.stop(); System.out.println("Total time :" + stopWatch.toString()); System.out.println("Nr of tokens : " + words.size()); System.out.println("Avg time : " + (stopWatch.getTime() * 1.0d) / (words.size() * 1.0d) + " ms"); }
From source file:org.trnltk.tokenizer.TokenizationGraph.java
public boolean isAddSpace(TextBlockGroup leftTextBlockGroup, TextBlockGroup rightTextBlockGroup,
        LinkedList<TextBlock> textBlocks, int currentBlockIndex) throws MissingTokenizationRuleException {
    final TokenizationGraphEdge edge = getRule(leftTextBlockGroup, rightTextBlockGroup, textBlocks,
            currentBlockIndex);
    if (edge == null) {
        int startIndex = Math.max(0, currentBlockIndex - CONTEXT_LENGTH);
        int endIndex = Math.min(textBlocks.size(), currentBlockIndex + CONTEXT_LENGTH);
        final TextBlockGroup contextBlockGroup = new TextBlockGroup(textBlocks.subList(startIndex, endIndex));

        final String leftTextBlockGroupStr = leftTextBlockGroup.toString().replace("\n", "\\n")
                .replace("\r", "\\r").replace("\t", "\\t");
        final String rightTextBlockGroupStr = rightTextBlockGroup.toString().replace("\n", "\\n")
                .replace("\r", "\\r").replace("\t", "\\t");

        throw new MissingTokenizationRuleException(leftTextBlockGroup, rightTextBlockGroup,
                "No rule found for \n\tleft : " + leftTextBlockGroupStr + "\n\tright " + rightTextBlockGroupStr,
                contextBlockGroup);
    }

    return edge.isAddSpace();
}
From source file:org.trnltk.tokenizer.TokenizationGraph.java
private TokenizationGraphEdge getRule(TextBlockGroup leftTextBlockGroup, TextBlockGroup rightTextBlockGroup,
        LinkedList<TextBlock> textBlocks, int currentBlockIndex) {
    final TextBlockTypeGroup leftTextBlockTypeGroup = leftTextBlockGroup.getTextBlockTypeGroup();
    final TextBlockTypeGroup rightTextBlockTypeGroup = rightTextBlockGroup.getTextBlockTypeGroup();

    final TokenizationGraphNode sourceNode = this.nodeMap.get(leftTextBlockTypeGroup);
    if (sourceNode == null) {
        int startIndex = Math.max(0, currentBlockIndex - CONTEXT_LENGTH);
        int endIndex = Math.min(textBlocks.size(), currentBlockIndex + CONTEXT_LENGTH);
        final TextBlockGroup contextBlockGroup = new TextBlockGroup(textBlocks.subList(startIndex, endIndex));

        final String leftTextBlockGroupStr = leftTextBlockGroup.toString().replace("\n", "\\n")
                .replace("\r", "\\r").replace("\t", "\\t");
        final String rightTextBlockGroupStr = rightTextBlockGroup.toString().replace("\n", "\\n")
                .replace("\r", "\\r").replace("\t", "\\t");

        throw new MissingTokenizationRuleException(leftTextBlockGroup, rightTextBlockGroup,
                "No source node found \n\tleft : " + leftTextBlockGroupStr + "\n\tright " + rightTextBlockGroupStr,
                contextBlockGroup);
    }

    final TokenizationGraphNode targetNode = this.nodeMap.get(rightTextBlockTypeGroup);
    if (targetNode == null) {
        int startIndex = Math.max(0, currentBlockIndex - CONTEXT_LENGTH);
        int endIndex = Math.min(textBlocks.size(), currentBlockIndex + CONTEXT_LENGTH);
        final TextBlockGroup contextBlockGroup = new TextBlockGroup(textBlocks.subList(startIndex, endIndex));

        final String leftTextBlockGroupStr = leftTextBlockGroup.toString().replace("\n", "\\n")
                .replace("\r", "\\r").replace("\t", "\\t");
        final String rightTextBlockGroupStr = rightTextBlockGroup.toString().replace("\n", "\\n")
                .replace("\r", "\\r").replace("\t", "\\t");

        throw new MissingTokenizationRuleException(leftTextBlockGroup, rightTextBlockGroup,
                "No target node found \n\tleft : " + leftTextBlockGroupStr + "\n\tright " + rightTextBlockGroupStr,
                contextBlockGroup);
    }

    return sourceNode.getEdge(rightTextBlockTypeGroup);
}