List of usage examples for java.util.LinkedList.size()
public int size()

Returns the number of elements in this list.
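A minimal, self-contained sketch of how size() behaves on a LinkedList (values below are hypothetical, not drawn from the examples that follow):

import java.util.LinkedList;

public class LinkedListSizeDemo {
    public static void main(String[] args) {
        LinkedList<String> list = new LinkedList<String>();
        System.out.println(list.size()); // 0 for an empty list

        list.add("a");
        list.add("b");
        list.add("c");
        System.out.println(list.size()); // 3 after three add() calls

        list.removeFirst();
        System.out.println(list.size()); // 2 after removing one element
    }
}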
From source file:eu.aniketos.scpm.impl.CompositionPlanner.java
private List<ICompositionPlan> createCompositionPlans(ICompositionPlan serviceSpecification, Map<String, Set<Service>> mapTaskServices) { //Creation of several composition plans combining the service locations if (logger.isDebugEnabled()) { logger.debug("Creating composition Plans for process " + serviceSpecification.getCompositionPlanID()); //$NON-NLS-1$ }//w w w .ja v a2s. c o m Set<String> taskIds = mapTaskServices.keySet(); Object[] taskIdsArray = (Object[]) taskIds.toArray(); LinkedList<ICompositionPlan> listCompositionPlan = new LinkedList<ICompositionPlan>(); XMLOutputter xmlOutput = new XMLOutputter(); xmlOutput.setFormat(Format.getPrettyFormat()); Iterator<String> itTaskIds = taskIds.iterator(); List<Set<Service>> list = new LinkedList<Set<Service>>(); while (itTaskIds.hasNext()) { String taskId = itTaskIds.next(); Set<Service> services = mapTaskServices.get(taskId); Set<String> locations = new HashSet<String>(); for (Service service : services) { locations.add(service.getLocation()); } list.add(services); } Set<List<Service>> set = Sets.cartesianProduct(list); Object[] setArray = (Object[]) set.toArray(); for (int j = 0; j < setArray.length; j++) { Document newDocument = BPMNParser.getDocument(serviceSpecification.getBPMNXML()); // List<String> locationsParsed = new Vector<String>(); @SuppressWarnings("unchecked") List<Object> listS = (List<Object>) setArray[j]; for (int i = taskIdsArray.length - 1; i >= 0; i--) { String taskId = (String) taskIdsArray[i]; // String taskId = serviceIds.get(i); Service service = (Service) listS.get(i); String location = service.getLocation(); // if(!locationsParsed.contains(location)){ // String portTypeName = WsdlParser.getInstance().getPortTypeName(location); // Element interfaceElement = XMLParser.addPortTypeName(newDocument, portTypeName); // Map<String,List<String>> hashOperationsData = WsdlParser.getInstance().getOperationsData(location); // Set<String> setOperations = hashOperationsData.keySet(); // for(String operation : setOperations){ // List<String> listInputOutput = hashOperationsData.get(operation); // XMLParser.addOperation(newDocument, interfaceElement, portTypeName, operation, listInputOutput); // } // locationsParsed.add(location); // } String serviceId = service.getServiceId(); logger.debug("ServiceId: " + serviceId); String provider = service.getProvider(); BPMNParser.addServiceId(newDocument, taskId, serviceId); BPMNParser.addLocationField(newDocument, taskId, location); BPMNParser.addProviderField(newDocument, taskId, provider); } int c = j + 1; BPMNParser.addProcessId(newDocument, "compositionPlan" + c); logger.debug("Created compositon plan" + c); String specification = xmlOutput.outputString(newDocument); ICompositionPlan compositionPlan = new CompositionPlan(specification); compositionPlan.setCompositionPlanID("compositionPlan" + c); //Interaction with the composition planner listCompositionPlan.add(compositionPlan); logger.debug("compositionPlan " + compositionPlan.getBPMNXML()); } logger.info("Created " + listCompositionPlan.size() + " Composition Plans"); return listCompositionPlan; }
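Distilled from the example above: plans are accumulated in a LinkedList and size() reports how many were created at the end. A hedged sketch with hypothetical plan names:

import java.util.LinkedList;

public class PlanCountDemo {
    public static void main(String[] args) {
        LinkedList<String> listCompositionPlan = new LinkedList<String>();
        for (int c = 1; c <= 4; c++) {
            listCompositionPlan.add("compositionPlan" + c);
        }
        // report how many plans were produced, as the logger.info() call above does
        System.out.println("Created " + listCompositionPlan.size() + " Composition Plans");
    }
}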
From source file:com.buaa.cfs.conf.Configuration.java
private Resource loadResource(Properties properties, Resource wrapper, boolean quiet) { String name = UNKNOWN_RESOURCE; try {/*from ww w . j a va2s. c om*/ Object resource = wrapper.getResource(); name = wrapper.getName(); DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance(); //ignore all comments inside the xml file docBuilderFactory.setIgnoringComments(true); //allow includes in the xml file docBuilderFactory.setNamespaceAware(true); try { docBuilderFactory.setXIncludeAware(true); } catch (UnsupportedOperationException e) { LOG.error("Failed to set setXIncludeAware(true) for parser " + docBuilderFactory + ":" + e, e); } DocumentBuilder builder = docBuilderFactory.newDocumentBuilder(); Document doc = null; Element root = null; boolean returnCachedProperties = false; if (resource instanceof URL) { // an URL resource doc = parse(builder, (URL) resource); } else if (resource instanceof String) { // a CLASSPATH resource URL url = getResource((String) resource); doc = parse(builder, url); } else if (resource instanceof Path) { // a file resource // Can't use FileSystem API or we get an infinite loop // since FileSystem uses Configuration API. Use java.io.File instead. File file = new File(((Path) resource).toUri().getPath()).getAbsoluteFile(); if (file.exists()) { if (!quiet) { LOG.debug("parsing File " + file); } doc = parse(builder, new BufferedInputStream(new FileInputStream(file)), ((Path) resource).toString()); } } else if (resource instanceof InputStream) { doc = parse(builder, (InputStream) resource, null); returnCachedProperties = true; } else if (resource instanceof Properties) { overlay(properties, (Properties) resource); } else if (resource instanceof Element) { root = (Element) resource; } if (root == null) { if (doc == null) { if (quiet) { return null; } throw new RuntimeException(resource + " not found"); } root = doc.getDocumentElement(); } Properties toAddTo = properties; if (returnCachedProperties) { toAddTo = new Properties(); } if (!"configuration".equals(root.getTagName())) LOG.fatal("bad conf file: top-level element not <configuration>"); NodeList props = root.getChildNodes(); DeprecationContext deprecations = deprecationContext.get(); for (int i = 0; i < props.getLength(); i++) { Node propNode = props.item(i); if (!(propNode instanceof Element)) continue; Element prop = (Element) propNode; if ("configuration".equals(prop.getTagName())) { loadResource(toAddTo, new Resource(prop, name), quiet); continue; } if (!"property".equals(prop.getTagName())) LOG.warn("bad conf file: element not <property>"); NodeList fields = prop.getChildNodes(); String attr = null; String value = null; boolean finalParameter = false; LinkedList<String> source = new LinkedList<String>(); for (int j = 0; j < fields.getLength(); j++) { Node fieldNode = fields.item(j); if (!(fieldNode instanceof Element)) continue; Element field = (Element) fieldNode; if ("name".equals(field.getTagName()) && field.hasChildNodes()) attr = StringInterner.weakIntern(((Text) field.getFirstChild()).getData().trim()); if ("value".equals(field.getTagName()) && field.hasChildNodes()) value = StringInterner.weakIntern(((Text) field.getFirstChild()).getData()); if ("final".equals(field.getTagName()) && field.hasChildNodes()) finalParameter = "true".equals(((Text) field.getFirstChild()).getData()); if ("source".equals(field.getTagName()) && field.hasChildNodes()) source.add(StringInterner.weakIntern(((Text) field.getFirstChild()).getData())); } source.add(name); // Ignore this parameter if it has 
already been marked as 'final' if (attr != null) { if (deprecations.getDeprecatedKeyMap().containsKey(attr)) { DeprecatedKeyInfo keyInfo = deprecations.getDeprecatedKeyMap().get(attr); keyInfo.clearAccessed(); for (String key : keyInfo.newKeys) { // update new keys with deprecated key's value loadProperty(toAddTo, name, key, value, finalParameter, source.toArray(new String[source.size()])); } } else { loadProperty(toAddTo, name, attr, value, finalParameter, source.toArray(new String[source.size()])); } } } if (returnCachedProperties) { overlay(properties, toAddTo); return new Resource(toAddTo, name); } return null; } catch (IOException e) { LOG.fatal("error parsing conf " + name, e); throw new RuntimeException(e); } catch (DOMException e) { LOG.fatal("error parsing conf " + name, e); throw new RuntimeException(e); } catch (SAXException e) { LOG.fatal("error parsing conf " + name, e); throw new RuntimeException(e); } catch (ParserConfigurationException e) { LOG.fatal("error parsing conf " + name, e); throw new RuntimeException(e); } }
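The configuration loader above converts its LinkedList of property sources to an array sized with size() (source.toArray(new String[source.size()])); a simplified sketch using hypothetical resource names:

import java.util.LinkedList;

public class ToArrayWithSize {
    public static void main(String[] args) {
        LinkedList<String> source = new LinkedList<String>();
        source.add("core-site.xml");
        source.add("override.xml");

        // size() supplies the exact length for the destination array
        String[] asArray = source.toArray(new String[source.size()]);
        System.out.println(asArray.length); // 2
    }
}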
From source file:cellularAutomata.analysis.PricingDistributionAnalysis.java
/** * Plots the pricing data as a time series. *//*from w w w.j av a 2s . c o m*/ private void plotTimeSeriesData() { for (int i = 0; i < numberOfStates; i++) { // create a list of all data points (exclude 0 if necessary) LinkedList<Point2D.Double> allPoints = new LinkedList<Point2D.Double>(); int startPosition = 0; if (!plotZeroState) { startPosition = 1; } allPoints.addAll(timeSeriesOfDistributionList[i]); // set the min and max values on the plot Point2D firstPoint = (Point2D) timeSeriesOfDistributionList[i].getFirst(); timeSeriesPlot[i].setMaximumXValue(firstPoint.getX() + MAX_NUMBER_TO_PLOT - 1); timeSeriesPlot[i].setMinimumXValue(0); timeSeriesPlot[i].setMinimumYValue(0.0); timeSeriesPlot[i].setXAxisLabel("Histogram"); timeSeriesPlot[i].setYAxisLabel("% bin"); timeSeriesPlot[i].showPlotLines(false); // set the max y-value double maxYValue = 0.0; Iterator<Point2D.Double> iterator = allPoints.iterator(); while (iterator.hasNext()) { Point2D.Double point = iterator.next(); if (point.y > maxYValue) { maxYValue = point.y; } } // now round up to the nearest tenth and add 0.1. This gives some // wiggle room before the plot will have to redraw the y-axis. If // redraw the y-axis too often, it will look bad. This crazy case // statement ensures that we don't get something like 0.7999999 // instead // of 0.8 (which is what was happening when I would divide by 10.0 // using // a formula to calculate the maxYValue). switch ((int) Math.ceil(maxYValue * 10.0)) { case 0: maxYValue = 0.1; break; case 1: maxYValue = 0.2; break; case 2: maxYValue = 0.3; break; case 3: maxYValue = 0.4; break; case 4: maxYValue = 0.5; break; case 5: maxYValue = 0.6; break; case 6: maxYValue = 0.7; break; case 7: maxYValue = 0.8; break; case 8: maxYValue = 0.9; break; case 9: maxYValue = 1.0; break; case 10: maxYValue = 1.0; break; case 11: maxYValue = 1.0; break; case 12: maxYValue = 1.0; break; } if (maxYValue > 1.0) { maxYValue = 1.0; } maxYValue = maxYValue * 1; timeSeriesPlot[i].setMaximumYValue(maxYValue); // draw some extra points on the y axes (looks good) int numberOfInteriorYValueLabels = (int) ((maxYValue / 20)); double[] yValues = new double[numberOfInteriorYValueLabels]; for (int j = 0; j < yValues.length; j++) { double answer = (j * 20); yValues[j] = answer; } timeSeriesPlot[i].setExtraYAxisValues(yValues); // specify colors for the points Color[] colorArray = new Color[allPoints.size()]; CellStateView view = Cell.getView(); Color stateColor = view.getDisplayColor(new IntegerCellState(i), null, new Coordinate(0, 0)); for (int j = 0; j < timeSeriesOfDistributionList[i].size(); j++) { colorArray[j] = stateColor; } timeSeriesPlot[i].setPointDisplayColors(colorArray); // draw the points! timeSeriesPlot[i].drawPoints(allPoints); } }
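The plot method above allocates a per-point color array whose length is taken from the LinkedList via size() (new Color[allPoints.size()]); a reduced sketch with hypothetical points:

import java.awt.Color;
import java.awt.geom.Point2D;
import java.util.LinkedList;

public class ColorArrayDemo {
    public static void main(String[] args) {
        LinkedList<Point2D.Double> allPoints = new LinkedList<Point2D.Double>();
        allPoints.add(new Point2D.Double(0, 0.2));
        allPoints.add(new Point2D.Double(1, 0.5));

        // one display color per buffered point
        Color[] colorArray = new Color[allPoints.size()];
        for (int j = 0; j < allPoints.size(); j++) {
            colorArray[j] = Color.RED;
        }
        System.out.println(colorArray.length); // 2
    }
}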
From source file:gedi.riboseq.inference.orf.OrfFinder.java
/** * Coordinates are in codonsRegion space! * @param index/*from ww w .j a v a 2 s. c o m*/ * @param sequence * @param sg * @param codonsRegion * @return */ public ArrayList<OrfWithCodons> findOrfs(int index, String sequence, SpliceGraph sg, ImmutableReferenceGenomicRegion<IntervalTreeSet<Codon>> codonsRegion) { SimpleDirectedGraph<Codon> fg = new SimpleDirectedGraph<Codon>("Codongraph"); // if (!codonsRegion.getReference().toString().equals("chr4+") || !codonsRegion.getRegion().contains(140_283_087)) // return 0; LeftMostInFrameAndClearList buff = new LeftMostInFrameAndClearList(); IntervalTreeSet<Codon> codons = codonsRegion.getData(); codons.removeIf(c -> c.getTotalActivity() < minCodonActivity); if (codons.size() == 0) return new ArrayList<OrfWithCodons>(); // add stop codons for easy orf inference HashSet<Codon> stopCodons = new HashSet<Codon>(); Trie<String> stop = new Trie<String>(); stop.put("TAG", "TAG"); stop.put("TGA", "TGA"); stop.put("TAA", "TAA"); stop.iterateAhoCorasick(sequence) .map(r -> new Codon(new ArrayGenomicRegion(r.getStart(), r.getEnd()), r.getValue())) .toCollection(stopCodons); for (Intron intr : sg.iterateIntrons().loop()) { ArrayGenomicRegion reg = new ArrayGenomicRegion(intr.getStart() - 2, intr.getStart(), intr.getEnd(), intr.getEnd() + 1); String cod = stop.get(SequenceUtils.extractSequence(reg, sequence)); if (cod != null) stopCodons.add(new Codon(reg, cod)); reg = new ArrayGenomicRegion(intr.getStart() - 1, intr.getStart(), intr.getEnd(), intr.getEnd() + 2); cod = stop.get(SequenceUtils.extractSequence(reg, sequence)); if (cod != null) stopCodons.add(new Codon(reg, cod)); } stopCodons.removeAll(codons); codons.addAll(stopCodons); ArrayList<OrfWithCodons> re = new ArrayList<OrfWithCodons>(); HashSet<Codon> usedForAnno = new HashSet<Codon>(); if (assembleAnnotationFirst) { // new: first use annotated transcripts in a greedy fashion ArrayList<ImmutableReferenceGenomicRegion<Transcript>> transcripts = annotation.ei(codonsRegion) .filter(t -> t.getData().isCoding()).map(t -> codonsRegion.induce(t, "T")).list(); int acount = 0; LinkedList<OrfWithCodons> orfs = new LinkedList<OrfWithCodons>(); GenomicRegion best; HashSet<Codon> aremoved = new HashSet<Codon>(); do { best = null; double bestSum = 0; for (ImmutableReferenceGenomicRegion<Transcript> tr : transcripts) { double[] a = new double[tr.getRegion().getTotalLength()]; for (Codon c : codons) { if (tr.getRegion().containsUnspliced(c)) { int p = tr.induce(c.getStart()); assert a[p] == 0; if (!aremoved.contains(c)) a[p] = c.totalActivity; if (c.isStop()) a[p] = -1; } } for (int f = 0; f < 3; f++) { int s = -1; double sum = 0; for (int p = f; p < a.length; p += 3) { if (a[p] == -1) {//stop if (sum > bestSum) { bestSum = sum; best = tr.getRegion().map(new ArrayGenomicRegion(s, p + 3)); } s = -1; sum = 0; } else sum += a[p]; if (a[p] > 0 && s == -1) s = p; } } } if (best != null) { ArrayList<Codon> cods = new ArrayList<>(); int uniqueCodons = 0; double uniqueActivity = 0; double totalActivity = 0; for (Codon c : codons) { if (best.containsUnspliced(c) && best.induce(c.getStart()) % 3 == 0) { if (aremoved.add(c)) { uniqueActivity += c.totalActivity; uniqueCodons++; } totalActivity += c.totalActivity; if (c.totalActivity > 0) cods.add(c); } } // System.out.println(codonsRegion.map(best)); if ((uniqueCodons >= minUniqueCodons || uniqueCodons == cods.size()) && uniqueActivity > minUniqueActivity && totalActivity > minOrfTotalActivity) { Collections.sort(cods); usedForAnno.addAll(cods); OrfWithCodons orf = new 
OrfWithCodons(index, 0, acount++, best.toArrayGenomicRegion(), cods, true); orfs.add(orf); } } } while (best != null); if (orfs.size() > 1) { // they are not necessarily connected! LinkedList<OrfWithCodons>[] connected = findConnectedOrfs(orfs); orfs.clear(); for (LinkedList<OrfWithCodons> corfs : connected) { for (boolean changed = true; changed && corfs.size() > 1;) { changed = false; if (useEM) inferOverlappingOrfActivitiesEM(corfs); else overlapUniqueCoverage(corfs); Iterator<OrfWithCodons> it = corfs.iterator(); while (it.hasNext()) { OrfWithCodons orf = it.next(); if (orf.getEstimatedTotalActivity() < minOrfTotalActivity) { it.remove(); changed = true; } } } if (corfs.size() > 1) distributeCodons(corfs); orfs.addAll(corfs); } } re.addAll(orfs); } // as edges only are represented in the splice graph, singleton codons are discarded (which does make sense anyway) for (Codon c : codons) { if (!c.isStop()) { // find unspliced successors (can be more than one, when the successor codon itself is spliced! all of them have the same start!) int max = c.getEnd() + maxAminoDist * 3; for (Codon n : codons .getIntervalsIntersecting(c.getEnd(), c.getEnd() + maxAminoDist * 3, buff.startAndClear(c)) .get()) { if (!containsInframeStop(sequence.substring(c.getEnd(), n.getStart()))) fg.addInteraction(c, n); max = n.getStart() + 2; } // find all spliced successors for each splice junction that comes before n or maxAminoDist sg.forEachIntronStartingBetween(c.getEnd(), max + 1, intron -> { for (Codon n : codons.getIntervalsIntersecting(intron.getEnd(), intron.getEnd() + maxAminoDist * 3 - (intron.getStart() - c.getEnd()), buff.startAndClear(c, intron)).get()) if (!containsInframeStop(SequenceUtils.extractSequence(new ArrayGenomicRegion(c.getStart(), intron.getStart(), intron.getEnd(), n.getStart()), sequence))) fg.addInteraction(c, n, intron); }); } } int cc = 1; for (SimpleDirectedGraph<Codon> g : fg.getWeaklyConnectedComponents()) { if (EI.wrap(g.getSources()).mapToDouble(c -> c.getTotalActivity()).sum() == 0) continue; // iterate longest paths in g LinkedList<Codon> topo = g.getTopologicalOrder(); HashSet<Codon> remInTopo = new HashSet<Codon>(topo); remInTopo.removeIf(c -> !stopCodons.contains(c) && !usedForAnno.contains(c)); HashSet<Codon> removed = new HashSet<Codon>(remInTopo); // double maxPathScore = 0; LinkedList<OrfWithCodons> orfs = new LinkedList<OrfWithCodons>(); int count = 0; while (removed.size() < topo.size()) { HashMap<Codon, MutablePair<GenomicRegion, Double>> longestPrefixes = new HashMap<Codon, MutablePair<GenomicRegion, Double>>(); for (Codon c : topo) longestPrefixes.put(c, new MutablePair<GenomicRegion, Double>(c, removed.contains(c) ? 0 : (c.getTotalActivity()))); Codon longestEnd = null; HashMap<Codon, Codon> backtracking = new HashMap<Codon, Codon>(); for (Codon c : topo) { // if (codonsRegion.map(c).getStart()==100_466_118) // System.out.println(c); // // if (codonsRegion.map(c).getStart()==100_465_842) // System.out.println(c); double len = longestPrefixes.get(c).Item2; for (AdjacencyNode<Codon> n = g.getTargets(c); n != null; n = n.next) { MutablePair<GenomicRegion, Double> pref = longestPrefixes.get(n.node); double nnact = removed.contains(n.node) ? 
0 : (n.node.getTotalActivity()); if (pref.Item2 <= len + nnact) { pref.set(extendFullPath(longestPrefixes.get(c).Item1, c, n.node, n.getLabel()), len + nnact); backtracking.put(n.node, c); } } if (longestEnd == null || longestPrefixes.get(longestEnd).Item2 <= len) longestEnd = c; } // determine longest path by backtracking and mark all codons on the path as removed ArrayList<Codon> orfCodons = new ArrayList<Codon>(); double totalActivity = 0; double uniqueActivity = 0; int uniqueCodons = 0; for (Codon c = longestEnd; c != null; c = backtracking.get(c)) { if (removed.add(c) && c.getTotalActivity() > 0) { uniqueCodons++; uniqueActivity += c.getTotalActivity(); } if (c.getTotalActivity() > 0) // to remove dummy stop codons orfCodons.add(c); totalActivity += c.getTotalActivity(); } // System.out.println(codonsRegion.map(longestPrefixes.get(longestEnd).Item1)); if ((uniqueCodons >= minUniqueCodons || uniqueCodons == orfCodons.size()) && uniqueActivity > minUniqueActivity && totalActivity > minOrfTotalActivity) { Collections.reverse(orfCodons); MutablePair<GenomicRegion, Double> triple = longestPrefixes.get(longestEnd); ArrayGenomicRegion region = triple.Item1.toArrayGenomicRegion(); String lastCodon = SequenceUtils.extractSequence( region.map( new ArrayGenomicRegion(region.getTotalLength() - 3, region.getTotalLength())), sequence); OrfWithCodons orf = new OrfWithCodons(index, cc, count++, region, orfCodons, stop.containsKey(lastCodon)); orfs.add(orf); } // maxPathScore = Math.max(maxPathScore,totalActivity); } if (orfs.size() > 1) { // they are not necessarily connected! LinkedList<OrfWithCodons>[] connected = findConnectedOrfs(orfs); orfs.clear(); for (LinkedList<OrfWithCodons> corfs : connected) { for (boolean changed = true; changed && corfs.size() > 1;) { changed = false; if (useEM) inferOverlappingOrfActivitiesEM(corfs); else overlapUniqueCoverage(corfs); Iterator<OrfWithCodons> it = corfs.iterator(); while (it.hasNext()) { OrfWithCodons orf = it.next(); if (orf.getEstimatedTotalActivity() < minOrfTotalActivity) { it.remove(); changed = true; } } } if (corfs.size() > 1) distributeCodons(corfs); orfs.addAll(corfs); } } re.addAll(orfs); cc++; } return re; }
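The longest-path loop above keeps iterating while fewer codons have been marked removed than the topological order holds (removed.size() < topo.size()); a minimal sketch of that termination test with hypothetical element names:

import java.util.HashSet;
import java.util.LinkedList;

public class RemoveUntilCovered {
    public static void main(String[] args) {
        LinkedList<String> topo = new LinkedList<String>();
        topo.add("codon1");
        topo.add("codon2");
        topo.add("codon3");

        HashSet<String> removed = new HashSet<String>();
        // keep extracting paths until every element of the list has been consumed
        while (removed.size() < topo.size()) {
            for (String c : topo) {
                if (removed.add(c)) {
                    System.out.println("handled " + c);
                    break;
                }
            }
        }
    }
}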
From source file:edu.harvard.iq.dvn.ingest.dsb.impl.DvnRJobRequest.java
public List<List<String>> getValueRange(String tkn) { dbgLog.fine("received token=" + tkn); String step0 = StringUtils.strip(tkn); dbgLog.fine("step0=" + step0); // string into tokens String[] step1raw = step0.split(","); dbgLog.fine("step1raw=" + StringUtils.join(step1raw, ",")); // remove meaningless commas if exist List<String> step1 = new ArrayList<String>(); for (String el : step1raw) { if (!el.equals("")) { step1.add(el);//from ww w .j ava 2s .c o m } } dbgLog.fine("step1=" + StringUtils.join(step1, ",")); List<List<String>> rangeData = new ArrayList<List<String>>(); // for each token, check the range operator for (int i = 0; i < step1.size(); i++) { LinkedList<String> tmp = new LinkedList<String>( Arrays.asList(String2StringArray(String.valueOf(step1.get(i))))); Map<String, String> token = new HashMap<String, String>(); boolean rangeMode = false; // .get(i) below CAN'T possibly be right (??) -- replacing // it with .get(0). -- L.A., v3.6 //if ((!tmp.get(i).equals("[")) && (!tmp.get(i).equals("("))){ if ((!tmp.get(0).equals("[")) && (!tmp.get(0).equals("("))) { // no LHS range operator // assume [ token.put("start", "3"); } else if (tmp.get(0).equals("[")) { rangeMode = true; token.put("start", "3"); tmp.removeFirst(); } else if (tmp.get(0).equals("(")) { rangeMode = true; token.put("start", "5"); tmp.removeFirst(); } if ((!tmp.getLast().equals("]")) && (!tmp.getLast().equals(")"))) { // no RHS range operator // assume ] token.put("end", "4"); } else if (tmp.getLast().equals("]")) { rangeMode = true; tmp.removeLast(); token.put("end", "4"); } else if (tmp.getLast().equals(")")) { rangeMode = true; tmp.removeLast(); token.put("end", "6"); } // I'm now enforcing the following rules: // the "rangeMode" above - a range must have at least one range // operator, a square bracket or parenthesis, on one end, at // least; i.e., either on the left, or on the right. // If there are no range operators, even if there are dashes // inside the token, they are not going to be interpreted as // range definitions. // still TODO: (possibly?) add more validation; figure out how // to encode *date* ranges ("-" is not optimal, since dates already // contain dashes... 
although, since dates are (supposed to be) // normalized it should still be possible to parse it unambiguously) // -- L.A., v3.6 if (rangeMode) { // after these steps, the string does not have range operators; // i.e., '-9--3', '--9', '-9-','-9', '-1-1', '1', '3-4', '6-' if ((tmp.get(0).equals("!")) && (tmp.get(1).equals("="))) { // != negation string is found token.put("start", "2"); token.put("end", ""); token.put("v1", StringUtils.join(tmp.subList(2, tmp.size()), "")); token.put("v2", ""); dbgLog.fine("value=" + StringUtils.join(tmp.subList(2, tmp.size()), ",")); } else if ((tmp.get(0).equals("-")) && (tmp.get(1).equals("-"))) { // type 2: --9 token.put("v1", ""); tmp.removeFirst(); token.put("v2", StringUtils.join(tmp, "")); } else if ((tmp.get(0).equals("-")) && (tmp.getLast().equals("-"))) { // type 3: -9- token.put("v2", ""); tmp.removeLast(); token.put("v1", StringUtils.join(tmp, "")); } else if ((!tmp.get(0).equals("-")) && (tmp.getLast().equals("-"))) { // type 8: 6- token.put("v2", ""); tmp.removeLast(); token.put("v1", StringUtils.join(tmp, "")); } else { int count = 0; List<Integer> index = new ArrayList<Integer>(); for (int j = 0; j < tmp.size(); j++) { if (tmp.get(j).equals("-")) { count++; index.add(j); } } if (count >= 2) { // range type // divide the second hyphen // types 1 and 5: -9--3, -1-1 // token.put("v1", StringUtils.join(tmp[0..($index[1]-1)],"" )); token.put("v2", StringUtils.join(tmp.subList((index.get(1) + 1), tmp.size()), "")); } else if (count == 1) { if (tmp.get(0).equals("-")) { // point negative type // type 4: -9 or -inf,9 // do nothing if ((token.get("start").equals("5")) && ((token.get("end").equals("6")) || (token.get("end").equals("4")))) { token.put("v1", ""); tmp.removeFirst(); token.put("v2", StringUtils.join(tmp, "")); } else { token.put("v1", StringUtils.join(tmp, "")); token.put("v2", StringUtils.join(tmp, "")); } } else { // type 7: 3-4 // both positive value and range type String[] vset = (StringUtils.join(tmp, "")).split("-"); token.put("v1", vset[0]); token.put("v2", vset[1]); } } else { // type 6: 1 token.put("v1", StringUtils.join(tmp, "")); token.put("v2", StringUtils.join(tmp, "")); } } } else { // assume that this is NOT a range; treat the entire sequence // of symbols as a single token: // type 6: 1 token.put("v1", StringUtils.join(tmp, "")); token.put("v2", StringUtils.join(tmp, "")); } dbgLog.fine(i + "-th result=" + token.get("start") + "|" + token.get("v1") + "|" + token.get("end") + "|" + token.get("v2")); List<String> rangeSet = new ArrayList<String>(); rangeSet.add(token.get("start")); rangeSet.add(token.get("v1")); rangeSet.add(token.get("end")); rangeSet.add(token.get("v2")); rangeData.add(rangeSet); } dbgLog.fine("rangeData:\n" + rangeData); return rangeData; }
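The range parser above treats the token list as a deque, checking getFirst()/getLast() and stripping bracket operators from both ends before interpreting what remains; a simplified sketch for a hypothetical input "[3-7)":

import java.util.Arrays;
import java.util.LinkedList;

public class RangeTokenDemo {
    public static void main(String[] args) {
        // single-character tokens of the range expression "[3-7)"
        LinkedList<String> tmp = new LinkedList<String>(Arrays.asList("[", "3", "-", "7", ")"));

        // strip the leading and trailing range operators
        if (tmp.getFirst().equals("[") || tmp.getFirst().equals("(")) {
            tmp.removeFirst();
        }
        if (tmp.getLast().equals("]") || tmp.getLast().equals(")")) {
            tmp.removeLast();
        }

        System.out.println(tmp.size()); // 3 tokens remain: 3, -, 7
    }
}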
From source file:hr.fer.spocc.regex.AbstractRegularExpression.java
protected RegularExpression<T> createParseTree(List<RegularExpressionElement> elements) { // System.out.println(">>> Parsing regexp: "+elements); /**/*from w ww . jav a 2 s . co m*/ * Stack which contains parts of regular expression * which are not yet used * by the operator. In addition, <code>null</code> values * can be pushed onto this stack to indicate that * the symbols to the right are grouped by the parenthesis. * */ LinkedList<RegularExpression<T>> symbolStack = new LinkedList<RegularExpression<T>>(); /** * Operator stack */ LinkedList<RegularExpressionOperator> opStack = new LinkedList<RegularExpressionOperator>(); boolean sentinelParentheses = false; // if (this.elements.get(0).getElementType() // != RegularExpressionElementType.LEFT_PARENTHESIS // || this.elements.get(elements.size()-1).getElementType() // != RegularExpressionElementType.RIGHT_PARENTHESIS) { sentinelParentheses = true; symbolStack.push(null); opStack.push(null); // } int ind = -1; Iterator<RegularExpressionElement> iter = elements.iterator(); while (iter.hasNext() || sentinelParentheses) { ++ind; RegularExpressionElement e; if (iter.hasNext()) { e = iter.next(); } else { // osiguraj dodatnu iteraciju za umjetnu zadnju ) e = RegularExpressionElements.RIGHT_PARENTHESIS; sentinelParentheses = false; } switch (e.getElementType()) { case SYMBOL: symbolStack.push(createTrivial(elements.subList(ind, ind + 1))); break; default: RegularExpressionOperator curOp = (e.getElementType() == RegularExpressionElementType.OPERATOR ? (RegularExpressionOperator) e : null); int priority = (curOp != null ? curOp.getPriority() : -1); if (e.getElementType() != RegularExpressionElementType.LEFT_PARENTHESIS) { // System.out.println("Pre-while symbolStack: "+symbolStack); while (!opStack.isEmpty() && opStack.getFirst() != null && opStack.getFirst().getPriority() >= priority && symbolStack.getFirst() != null) { RegularExpressionOperator op = opStack.pop(); int arity = op.getArity(); int elementCount = 0; // System.out.println("POP: "+op); @SuppressWarnings("unchecked") RegularExpression<T>[] operands = new RegularExpression[arity]; for (int i = arity - 1; i >= 0; --i) { if (symbolStack.isEmpty()) { throw new IllegalArgumentException("Missing ( after"); } else if (symbolStack.getFirst() == null) { throw new IllegalArgumentException("Missing operand #" + (arity - i) + " for the operator " + op + " before index " + ind); } operands[i] = symbolStack.pop(); elementCount += operands[i].size(); } RegularExpression<T> regex = createComposite(elements.subList(ind - elementCount - 1, ind), op, operands); // System.err.println(regex); // System.err.println(regex.getSubexpression(0)); symbolStack.push(regex); // System.out.println("Group: "+ // ArrayToStringUtils.toString(operands, "\n")); // System.out.println("End group"); // System.out.println("Evaluated [" + (ind-elementCount-1) // + ", " + ind + "): "+regex); // System.out.println("Symbol stack: "+symbolStack); // System.out.println("Op stack: "+opStack); // System.out.println("---"); } } if (curOp != null) { opStack.push(curOp); } else { switch (e.getElementType()) { case LEFT_PARENTHESIS: symbolStack.push(null); opStack.push(null); break; default: // ako je ) Validate.isTrue(symbolStack.size() >= 2, "Exactly one expression is expected " + "inside parentheses before index " + ind); // pop left bracket (null) from the operator stack Object nullValue = opStack.pop(); Validate.isTrue(nullValue == null); // pop left bracket (null) from the symbol stack RegularExpression<T> regex = symbolStack.pop(); 
nullValue = symbolStack.pop(); // check if left bracket was removed indeed // Validate.isTrue(nullValue == null, // "Expected ( at index " + (ind-regex.size()-1)); // expand the expression if parentheses are not sentinel if (sentinelParentheses) { // XXX neki drugi flag bolje // System.out.print("Expand [" // + (ind - regex.size() - 1) + ", " // + (ind + 1) + "]: "); // System.out.println("[regex size = "+regex.size() // + "]"); regex = createExpanded(regex, elements.subList(ind - regex.size() - 1, ind + 1)); // System.out.println(" -> "+regex); } // and put back the expression inside parentheses symbolStack.push(regex); } } } // end of switch // System.out.println("----- " + ind + " ----"); // System.out.println("Symbol stack: "+symbolStack); // System.out.println("Op stack: "+opStack); } //Validate.isTrue(symbolStack.size() == 1); //Validate.isTrue(opStack.isEmpty()); return symbolStack.pop(); }
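The regular-expression parser above uses LinkedLists as stacks and guards multi-operand pops with size() checks (e.g. symbolStack.size() >= 2); a reduced sketch of that guard with hypothetical operands:

import java.util.LinkedList;

public class StackSizeCheck {
    public static void main(String[] args) {
        LinkedList<Integer> symbolStack = new LinkedList<Integer>();
        symbolStack.push(1);
        symbolStack.push(2);

        // only pop two operands when the stack is known to hold at least two
        if (symbolStack.size() >= 2) {
            int right = symbolStack.pop();
            int left = symbolStack.pop();
            System.out.println(left + " op " + right); // 1 op 2
        }
    }
}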
From source file:com.edgenius.wiki.render.filter.MacroFilter.java
/** *//*w w w . j a v a 2 s .c om*/ private void checkGroup(final int initPos, CharSequence input, final LinkedList<GroupProcessor> stack, List<GroupProcessor> processors) { final List<Region> pairRegions = new ArrayList<Region>(); singleMacroProvider.replaceByTokenVisitor(input, new TokenVisitor<Matcher>() { public void handleMatch(StringBuffer buffer, Matcher result) { String macroName = result.group(1); if (macroName != null && !macroName.startsWith("$")) { Macro macro = macroMgr.getMacro(macroName); if (macro != null) { //IMPORTANT: here does not check Macro.isPair() and also put it into pairRegions for following process //it is the sequence of process must keep consistant with physical sequence in input text, //for example, {table}{cell}...{rowdiv}, rowdiv is single and must be after cell int start = result.start(0); int end = result.end(0); Region pair = new Region(start, end); //no parameter, then mark as unknown, otherwise, must be a start macro if (StringUtils.isBlank(result.group(2))) { pair.setKey(MACRO_REGION_KEY_UNKNOWN); } else { pair.setKey(MACRO_REGION_KEY_START); } //just for temporary to remember the macro name... pair.setContent(macroName); //sum to list pairRegions.add(pair); } } } }); int size = pairRegions.size(); if (size > 0) { StringBuffer inputBuf = new StringBuffer(input); for (int idx = 0; idx < size; idx++) { Region reg = pairRegions.get(idx); Macro macro = macroMgr.getMacro(reg.getContent()); if (macro.isPaired()) { int deep = 0; Region pair = null; //looking for pairs... for (int chIdx = idx + 1; chIdx < size; chIdx++) { Region next = pairRegions.get(chIdx); if (StringUtils.equalsIgnoreCase(reg.getContent(), next.getContent())) { //start is unknown (no attribute), then end must be unknown if (MACRO_REGION_KEY_UNKNOWN.equals(reg.getKey()) && MACRO_REGION_KEY_UNKNOWN.equals(next.getKey())) { //matched pair = next; //skip all internal node - which is handle by embedded recursive idx = chIdx; break; } if (MACRO_REGION_KEY_START.equals(reg.getKey()) && MACRO_REGION_KEY_UNKNOWN.equals(next.getKey())) { if (deep == 0) { //matched; pair = next; //skip all internal node - which is handle by embedded recursive idx = chIdx; break; } else { //just another inner same name macro matched, deep minus 1 deep--; } } if (MACRO_REGION_KEY_START.equals(next.getKey())) { //ok, it gets another start, in 4th scenarios - then add deep deep++; } } } //ok, success find paired if (pair != null) { int start = initPos + reg.getStart(); int end = initPos + pair.getEnd(); int contentStart = initPos + reg.getEnd(); int contentEnd = initPos + pair.getStart(); GroupProcessor currProcessor = stack.size() == 0 ? null : stack.getLast(); if (currProcessor != null) { currProcessor.adoptChild(macro, start, end); } if (macro.isProcessEmbedded() && (end > start)) { if (macro.hasChildren() != null) { stack.add(((GroupProcessorMacro) macro).newGroupProcessor(macro, start, end)); } checkGroup(contentStart, inputBuf.subSequence(contentStart - initPos, contentEnd - initPos), stack, processors); if (macro.hasChildren() != null) { //pop the current one, means it is a completed GroupProcessor processors.add(stack.removeLast()); } } } } else { //single macro - directly detect if it is child GroupProcessor currProcessor = stack.size() == 0 ? null : stack.getLast(); if (currProcessor != null) { currProcessor.adoptChild(macro, initPos + reg.getStart(), initPos + reg.getEnd()); } } } } }
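The macro filter above peeks at the innermost group processor only when its stack is non-empty (stack.size() == 0 ? null : stack.getLast()); a minimal sketch of that idiom with hypothetical macro names:

import java.util.LinkedList;

public class PeekLastOrNull {
    public static void main(String[] args) {
        LinkedList<String> stack = new LinkedList<String>();

        // null while there is no enclosing processor, otherwise the innermost one
        String current = stack.size() == 0 ? null : stack.getLast();
        System.out.println(current); // null

        stack.add("table");
        stack.add("cell");
        current = stack.size() == 0 ? null : stack.getLast();
        System.out.println(current); // cell
    }
}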
From source file:elh.eus.absa.Features.java
/** * Function fills the attribute vectors for the instances existing in the corpus given. * Attribute vectors contain the features loaded by the creatFeatureSet() function. * //from www .j av a 2 s .co m * @param boolean save : whether the Instances file should be saved to an arff file or not. * @return Weka Instances object containing the attribute vectors filled with the features specified * in the parameter file. */ public Instances loadInstances(boolean save, String prefix) throws IOException { String savePath = params.getProperty("fVectorDir") + File.separator + "arff" + File.separator + "train_" + prefix; HashMap<String, Opinion> trainExamples = corpus.getOpinions(); int trainExamplesNum = trainExamples.size(); int bowWin = 0; if (params.containsKey("window")) { bowWin = Integer.parseInt(params.getProperty("window")); savePath = savePath + "_w" + bowWin; } //Properties posProp = new Properties(); //eus.ixa.ixa.pipe.pos.Annotate postagger = new eus.ixa.ixa.pipe.pos.Annotate(posProp); if (params.containsKey("lemmaNgrams")) { Properties posProp = NLPpipelineWrapper.setPostaggerProperties(params.getProperty("pos-model"), corpus.getLang(), "3", "bin", "false"); postagger = new eus.ixa.ixa.pipe.pos.Annotate(posProp); } //System.out.println("train examples: "+trainExamplesNum); //Create the Weka object for the training set Instances rsltdata = new Instances("train", atts, trainExamplesNum); // setting class attribute (last attribute in train data. //traindata.setClassIndex(traindata.numAttributes() - 1); System.err.println("Features: loadInstances() - featNum: " + this.featNum + " - trainset attrib num -> " + rsltdata.numAttributes() + " - "); System.out.println("Features: loadInstances() - featNum: " + this.featNum + " - trainset attrib num -> " + rsltdata.numAttributes() + " - "); int instId = 1; // fill the vectors for each training example for (String oId : trainExamples.keySet()) { //System.err.println("sentence: "+ corpus.getOpinionSentence(o.getId())); //value vector double[] values = new double[featNum]; // first element is the instanceId values[rsltdata.attribute("instanceId").index()] = instId; // string normalization (emoticons, twitter grammar,...) String opNormalized = corpus.getOpinionSentence(oId); // compute uppercase ratio before normalization (if needed) double upRatio = 0.0; if (params.getProperty("upperCaseRatio", "no").equalsIgnoreCase("yes")) { String upper = opNormalized.replaceAll("[\\p{Ll}]", ""); upRatio = (double) upper.length() / (double) opNormalized.length(); values[rsltdata.attribute("upperCaseRation").index()] = upRatio; } // string normalization (emoticons, twitter grammar,...) if ((params.containsKey("wfngrams") || params.containsKey("lemmaNgrams")) && (!params.getProperty("normalization", "none").equalsIgnoreCase("noEmot"))) { opNormalized = normalize(opNormalized, params.getProperty("normalization", "none")); } //process the current instance with the NLP pipeline in order to get token and lemma|pos features KAFDocument nafinst = new KAFDocument("", ""); String nafname = trainExamples.get(oId).getsId().replace(':', '_'); String nafDir = params.getProperty("kafDir"); String nafPath = nafDir + File.separator + nafname + ".kaf"; //counter for opinion sentence token number. 
Used for computing relative values of the features int tokNum = 1; try { if (params.containsKey("lemmaNgrams")) //(lemmaNgrams != null) && (!lemmaNgrams.isEmpty())) { if (FileUtilsElh.checkFile(nafPath)) { nafinst = KAFDocument.createFromFile(new File(nafPath)); } else { nafinst = NLPpipelineWrapper.ixaPipesTokPos(opNormalized, corpus.getLang(), params.getProperty("pos-model"), postagger); Files.createDirectories(Paths.get(nafDir)); nafinst.save(nafPath); } tokNum = nafinst.getWFs().size(); //System.err.println("Features::loadInstances - postagging opinion sentence ("+oId+") - "+corpus.getOpinionSentence(oId)); } else { if (FileUtilsElh.checkFile(nafPath)) { nafinst = KAFDocument.createFromFile(new File(nafPath)); } else { nafinst = NLPpipelineWrapper.ixaPipesTok(opNormalized, corpus.getLang()); } tokNum = nafinst.getWFs().size(); //System.err.println("Features::loadInstances - tokenizing opinion sentence ("+oId+") - "+corpus.getOpinionSentence(oId)); } } catch (IOException | JDOMException e) { System.err.println("Features::loadInstances() - error when NLP processing the instance " + instId + "|" + oId + ") for filling the attribute vector"); e.printStackTrace(); System.exit(5); } LinkedList<String> ngrams = new LinkedList<String>(); int ngramDim; try { ngramDim = Integer.valueOf(params.getProperty("wfngrams")); } catch (Exception e) { ngramDim = 0; } boolean polNgrams = false; if (params.containsKey("polNgrams")) { polNgrams = params.getProperty("polNgrams").equalsIgnoreCase("yes"); } List<WF> window = nafinst.getWFs(); Integer end = corpus.getOpinion(oId).getTo(); // apply window if window active (>0) and if the target is not null (to=0) if ((bowWin > 0) && (end > 0)) { Integer start = corpus.getOpinion(oId).getFrom(); Integer to = window.size(); Integer from = 0; end++; for (int i = 0; i < window.size(); i++) { WF wf = window.get(i); if ((wf.getOffset() == start) && (i >= bowWin)) { from = i - bowWin; } else if (wf.getOffset() >= end) { if (i + bowWin < window.size()) { to = i + bowWin; } break; } } window = window.subList(from, to); //System.out.println("startTgt: "+start+" - from: "+from+" | endTrgt:"+(end-1)+" - to:"+to); } //System.out.println("Sentence: "+corpus.getOpinionSentence(oId)+" - target: "+corpus.getOpinion(oId).getTarget()+ // "\n window: from-> "+window.get(0).getForm()+" to-> "+window.get(window.size()-1)+" .\n"); List<String> windowWFIds = new ArrayList<String>(); // word form ngram related features for (WF wf : window) { windowWFIds.add(wf.getId()); String wfStr = wf.getForm(); if (params.containsKey("wfngrams") && ngramDim > 0) { if (!savePath.contains("_wf" + ngramDim)) { savePath = savePath + "_wf" + ngramDim; } //if the current word form is in the ngram list activate the feature in the vector if (ngrams.size() >= ngramDim) { ngrams.removeFirst(); } ngrams.add(wfStr); // add ngrams to the feature vector checkNgramFeatures(ngrams, values, "wf", 1, false); //toknum } // Clark cluster info corresponding to the current word form if (params.containsKey("clark") && attributeSets.get("ClarkCl").containsKey(wfStr)) { if (!savePath.contains("_cl")) { savePath = savePath + "_cl"; } values[rsltdata.attribute("ClarkClId_" + attributeSets.get("ClarkCl").get(wfStr)).index()]++; } // Clark cluster info corresponding to the current word form if (params.containsKey("brown") && attributeSets.get("BrownCl").containsKey(wfStr)) { if (!savePath.contains("_br")) { savePath = savePath + "_br"; } values[rsltdata.attribute("BrownClId_" + 
attributeSets.get("BrownCl").get(wfStr)).index()]++; } // Clark cluster info corresponding to the current word form if (params.containsKey("word2vec") && attributeSets.get("w2vCl").containsKey(wfStr)) { if (!savePath.contains("_w2v")) { savePath = savePath + "_w2v"; } values[rsltdata.attribute("w2vClId_" + attributeSets.get("w2vCl").get(wfStr)).index()]++; } } //empty ngram list and add remaining ngrams to the feature list checkNgramFeatures(ngrams, values, "wf", 1, true); //toknum // PoS tagger related attributes: lemmas and pos tags if (params.containsKey("lemmaNgrams") || (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0")) || params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) { ngrams = new LinkedList<String>(); if (params.containsKey("lemmaNgrams") && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) { ngramDim = Integer.valueOf(params.getProperty("lemmaNgrams")); } else { ngramDim = 3; } LinkedList<String> posNgrams = new LinkedList<String>(); int posNgramDim = 0; if (params.containsKey("pos")) { posNgramDim = Integer.valueOf(params.getProperty("pos")); } for (Term t : nafinst.getTermsFromWFs(windowWFIds)) { //lemmas // && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0")) if ((params.containsKey("lemmaNgrams")) || params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) { if (!savePath.contains("_l" + ngramDim)) { savePath = savePath + "_l" + ngramDim; } String lemma = t.getLemma(); if (ngrams.size() >= ngramDim) { ngrams.removeFirst(); } ngrams.add(lemma); // add ngrams to the feature vector for (int i = 0; i < ngrams.size(); i++) { String ng = featureFromArray(ngrams.subList(0, i + 1), "lemma"); //if the current lemma is in the ngram list activate the feature in the vector if (params.containsKey("lemmaNgrams") && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) { Attribute ngAtt = rsltdata.attribute(ng); if (ngAtt != null) { addNumericToFeatureVector(ng, values, 1); //tokNum } } ng = featureFromArray(ngrams.subList(0, i + 1), ""); if (params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) { checkPolarityLexicons(ng, values, tokNum, polNgrams); } //end polarity ngram checker } //end ngram checking } //pos tags if (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0")) { if (!savePath.contains("_p")) { savePath = savePath + "_p"; } if (posNgrams.size() >= posNgramDim) { posNgrams.removeFirst(); } posNgrams.add(t.getPos()); // add ngrams to the feature vector checkNgramFeatures(posNgrams, values, "pos", 1, false); } } //endFor //empty ngram list and add remaining ngrams to the feature list while (!ngrams.isEmpty()) { String ng = featureFromArray(ngrams, "lemma"); //if the current lemma is in the ngram list activate the feature in the vector if (rsltdata.attribute(ng) != null) { addNumericToFeatureVector(ng, values, 1); //tokNum } // polarity lexicons if (params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) { checkPolarityLexicons(ng, values, tokNum, polNgrams); } //end polarity ngram checker ngrams.removeFirst(); } //empty pos ngram list and add remaining pos ngrams to the feature list checkNgramFeatures(posNgrams, values, "pos", 1, true); } // add sentence length as a feature if (params.containsKey("sentenceLength") && (!params.getProperty("sentenceLength").equalsIgnoreCase("no"))) { values[rsltdata.attribute("sentenceLength").index()] = tokNum; } //create object for the 
current instance and associate it with the current train dataset. Instance inst = new SparseInstance(1.0, values); inst.setDataset(rsltdata); // add category attributte values String cat = trainExamples.get(oId).getCategory(); if (params.containsKey("categories") && params.getProperty("categories").compareTo("E&A") == 0) { if (cat.compareTo("NULL") == 0) { inst.setValue(rsltdata.attribute("entCat").index(), cat); inst.setValue(rsltdata.attribute("attCat").index(), cat); } else { String[] splitCat = cat.split("#"); inst.setValue(rsltdata.attribute("entCat").index(), splitCat[0]); inst.setValue(rsltdata.attribute("attCat").index(), splitCat[1]); } //inst.setValue(attIndexes.get("entAttCat"), cat); } else if (params.containsKey("categories") && params.getProperty("categories").compareTo("E#A") == 0) { inst.setValue(rsltdata.attribute("entAttCat").index(), cat); } if (params.containsKey("polarity") && params.getProperty("polarity").compareTo("yes") == 0) { // add class value as a double (Weka stores all values as doubles ) String pol = normalizePolarity(trainExamples.get(oId).getPolarity()); //System.err.println("Features::loadInstances - pol "+pol+" for oid "+oId+" - text:"+corpus.getOpinionSentence(oId)); if (pol != null && !pol.isEmpty()) { //System.err.println("polarity: _"+pol+"_"); inst.setValue(rsltdata.attribute("polarityCat"), pol); } else { inst.setMissing(rsltdata.attribute("polarityCat")); } } //add instance to train data rsltdata.add(inst); //store opinion Id and instance Id this.opInst.put(oId, instId); instId++; } System.err.println("Features : loadInstances() - training data ready total number of examples -> " + trainExamplesNum + " - " + rsltdata.numInstances()); if (save) { try { savePath = savePath + ".arff"; System.err.println("arff written to: " + savePath); ArffSaver saver = new ArffSaver(); saver.setInstances(rsltdata); saver.setFile(new File(savePath)); saver.writeBatch(); } catch (IOException e1) { e1.printStackTrace(); } catch (Exception e2) { e2.printStackTrace(); } } return rsltdata; }
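The feature extractor above keeps a bounded n-gram window: once size() reaches the configured n-gram order, the oldest token is dropped with removeFirst() before the next one is added. A distilled sketch with hypothetical tokens:

import java.util.LinkedList;

public class NgramWindowDemo {
    public static void main(String[] args) {
        int ngramDim = 3; // maximum n-gram length
        LinkedList<String> ngrams = new LinkedList<String>();

        for (String token : new String[] { "the", "quick", "brown", "fox" }) {
            // drop the oldest token once the window is full
            if (ngrams.size() >= ngramDim) {
                ngrams.removeFirst();
            }
            ngrams.add(token);
            System.out.println(ngrams); // the window never exceeds 3 tokens
        }
    }
}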
From source file:com.joliciel.talismane.TalismaneImpl.java
public void analyse(TalismaneConfig config) { try {/*from www . j ava 2 s . com*/ if (config.needsSentenceDetector()) { if (config.getSentenceDetector() == null) { throw new TalismaneException("Sentence detector not provided."); } } if (config.needsTokeniser()) { if (config.getTokeniser() == null) { throw new TalismaneException("Tokeniser not provided."); } } if (config.needsPosTagger()) { if (config.getPosTagger() == null) { throw new TalismaneException("Pos-tagger not provided."); } } if (config.needsParser()) { if (config.getParser() == null) { throw new TalismaneException("Parser not provided."); } } if (config.getEndModule().equals(Module.SentenceDetector)) { if (this.getSentenceProcessor() == null) { throw new TalismaneException( "No sentence processor provided with sentence detector end module, cannot generate output."); } } if (config.getEndModule().equals(Module.Tokeniser)) { if (this.getTokenSequenceProcessor() == null) { throw new TalismaneException( "No token sequence processor provided with tokeniser end module, cannot generate output."); } } if (config.getEndModule().equals(Module.PosTagger)) { if (this.getPosTagSequenceProcessor() == null) { throw new TalismaneException( "No postag sequence processor provided with pos-tagger end module, cannot generate output."); } } if (config.getEndModule().equals(Module.Parser)) { if (this.getParseConfigurationProcessor() == null) { throw new TalismaneException( "No parse configuration processor provided with parser end module, cannot generate output."); } } LinkedList<String> textSegments = new LinkedList<String>(); LinkedList<Sentence> sentences = new LinkedList<Sentence>(); TokenSequence tokenSequence = null; PosTagSequence posTagSequence = null; RollingSentenceProcessor rollingSentenceProcessor = this.getFilterService() .getRollingSentenceProcessor(config.getFileName(), config.isProcessByDefault()); Sentence leftover = null; if (config.getStartModule().equals(Module.SentenceDetector) || config.getStartModule().equals(Module.Tokeniser)) { // prime the sentence detector with two text segments, to ensure everything gets processed textSegments.addLast(""); textSegments.addLast(""); } StringBuilder stringBuilder = new StringBuilder(); boolean finished = false; int sentenceCount = 0; String prevProcessedText = ""; String processedText = ""; String nextProcessedText = ""; SentenceHolder prevSentenceHolder = null; int endBlockCharacterCount = 0; while (!finished) { if (config.getStartModule().equals(Module.SentenceDetector) || config.getStartModule().equals(Module.Tokeniser)) { // Note SentenceDetector and Tokeniser start modules treated identically, // except that for SentenceDetector we apply a probabilistic sentence detector // whereas for Tokeniser we assume all sentence breaks are marked by filters // read characters from the reader, one at a time char c; int r = -1; try { r = this.getReader().read(); } catch (IOException e) { LogUtils.logError(LOG, e); } if (r == -1) { finished = true; c = '\n'; } else { c = (char) r; } // Jump out if we have 3 consecutive end-block characters. if (c == config.getEndBlockCharacter()) { endBlockCharacterCount++; if (endBlockCharacterCount == 3) { LOG.info("Three consecutive end-block characters. 
Exiting."); finished = true; } } else { endBlockCharacterCount = 0; } // have sentence detector if (finished || (Character.isWhitespace(c) && stringBuilder.length() > config.getBlockSize()) || c == config.getEndBlockCharacter()) { if (c == config.getEndBlockCharacter()) stringBuilder.append(c); if (stringBuilder.length() > 0) { String textSegment = stringBuilder.toString(); stringBuilder = new StringBuilder(); textSegments.add(textSegment); } // is the current block > 0 characters? if (c == config.getEndBlockCharacter()) { textSegments.addLast(""); } } // is there a next block available? if (finished) { if (stringBuilder.length() > 0) { textSegments.addLast(stringBuilder.toString()); stringBuilder = new StringBuilder(); } textSegments.addLast(""); textSegments.addLast(""); textSegments.addLast(""); } if (c != config.getEndBlockCharacter()) stringBuilder.append(c); while (textSegments.size() >= 3) { String prevText = textSegments.removeFirst(); String text = textSegments.removeFirst(); String nextText = textSegments.removeFirst(); if (LOG.isTraceEnabled()) { LOG.trace("prevText: " + prevText); LOG.trace("text: " + text); LOG.trace("nextText: " + nextText); } Set<TextMarker> textMarkers = new TreeSet<TextMarker>(); for (TextMarkerFilter textMarkerFilter : config.getTextMarkerFilters()) { Set<TextMarker> result = textMarkerFilter.apply(prevText, text, nextText); textMarkers.addAll(result); } // push the text segments back onto the beginning of Deque textSegments.addFirst(nextText); textSegments.addFirst(text); SentenceHolder sentenceHolder = rollingSentenceProcessor.addNextSegment(text, textMarkers); prevProcessedText = processedText; processedText = nextProcessedText; nextProcessedText = sentenceHolder.getText(); if (LOG.isTraceEnabled()) { LOG.trace("prevProcessedText: " + prevProcessedText); LOG.trace("processedText: " + processedText); LOG.trace("nextProcessedText: " + nextProcessedText); } boolean reallyFinished = finished && textSegments.size() == 3; if (prevSentenceHolder != null) { if (config.getStartModule().equals(Module.SentenceDetector)) { List<Integer> sentenceBreaks = config.getSentenceDetector() .detectSentences(prevProcessedText, processedText, nextProcessedText); for (int sentenceBreak : sentenceBreaks) { prevSentenceHolder.addSentenceBoundary(sentenceBreak); } } List<Sentence> theSentences = prevSentenceHolder.getDetectedSentences(leftover); leftover = null; for (Sentence sentence : theSentences) { if (sentence.isComplete() || reallyFinished) { sentences.add(sentence); sentenceCount++; } else { LOG.debug("Setting leftover to: " + sentence.getText()); leftover = sentence; } } if (config.getMaxSentenceCount() > 0 && sentenceCount >= config.getMaxSentenceCount()) { finished = true; } } prevSentenceHolder = sentenceHolder; } // we have at least 3 text segments (should always be the case once we get started) } else if (config.getStartModule().equals(Module.PosTagger)) { if (config.getTokenCorpusReader().hasNextTokenSequence()) { tokenSequence = config.getTokenCorpusReader().nextTokenSequence(); } else { tokenSequence = null; finished = true; } } else if (config.getStartModule().equals(Module.Parser)) { if (config.getPosTagCorpusReader().hasNextPosTagSequence()) { posTagSequence = config.getPosTagCorpusReader().nextPosTagSequence(); } else { posTagSequence = null; finished = true; } } // which start module? 
boolean needToProcess = false; if (config.getStartModule().equals(Module.SentenceDetector) || config.getStartModule().equals(Module.Tokeniser)) needToProcess = !sentences.isEmpty(); else if (config.getStartModule().equals(Module.PosTagger)) needToProcess = tokenSequence != null; else if (config.getStartModule().equals(Module.Parser)) needToProcess = posTagSequence != null; while (needToProcess) { Sentence sentence = null; if (config.getStartModule().compareTo(Module.Tokeniser) <= 0 && config.getEndModule().compareTo(Module.SentenceDetector) >= 0) { sentence = sentences.poll(); LOG.debug("Sentence: " + sentence); if (this.getSentenceProcessor() != null) this.getSentenceProcessor().onNextSentence(sentence.getText(), this.getWriter()); } // need to read next sentence List<TokenSequence> tokenSequences = null; if (config.needsTokeniser()) { tokenSequences = config.getTokeniser().tokenise(sentence); tokenSequence = tokenSequences.get(0); if (this.getTokenSequenceProcessor() != null) { this.getTokenSequenceProcessor().onNextTokenSequence(tokenSequence, this.getWriter()); } } // need to tokenise ? List<PosTagSequence> posTagSequences = null; if (config.needsPosTagger()) { posTagSequence = null; if (tokenSequences == null || !config.isPropagateTokeniserBeam()) { tokenSequences = new ArrayList<TokenSequence>(); tokenSequences.add(tokenSequence); } if (config.getPosTagger() instanceof NonDeterministicPosTagger) { NonDeterministicPosTagger nonDeterministicPosTagger = (NonDeterministicPosTagger) config .getPosTagger(); posTagSequences = nonDeterministicPosTagger.tagSentence(tokenSequences); posTagSequence = posTagSequences.get(0); } else { posTagSequence = config.getPosTagger().tagSentence(tokenSequence); } if (posTagSequenceProcessor != null) { posTagSequenceProcessor.onNextPosTagSequence(posTagSequence, this.getWriter()); } tokenSequence = null; } // need to postag if (config.needsParser()) { if (posTagSequences == null || !config.isPropagatePosTaggerBeam()) { posTagSequences = new ArrayList<PosTagSequence>(); posTagSequences.add(posTagSequence); } ParseConfiguration parseConfiguration = null; List<ParseConfiguration> parseConfigurations = null; try { if (config.getParser() instanceof NonDeterministicParser) { NonDeterministicParser nonDeterministicParser = (NonDeterministicParser) config .getParser(); parseConfigurations = nonDeterministicParser.parseSentence(posTagSequences); parseConfiguration = parseConfigurations.get(0); } else { parseConfiguration = config.getParser().parseSentence(posTagSequence); } if (this.getParseConfigurationProcessor() != null) { this.getParseConfigurationProcessor().onNextParseConfiguration(parseConfiguration, this.getWriter()); } } catch (Exception e) { LOG.error(e); if (stopOnError) throw new RuntimeException(e); } posTagSequence = null; } // need to parse if (config.getStartModule().equals(Module.SentenceDetector) || config.getStartModule().equals(Module.Tokeniser)) needToProcess = !sentences.isEmpty(); else if (config.getStartModule().equals(Module.PosTagger)) needToProcess = tokenSequence != null; else if (config.getStartModule().equals(Module.Parser)) needToProcess = posTagSequence != null; } // next sentence } // next character } finally { if (this.getParseConfigurationProcessor() != null) { this.getParseConfigurationProcessor().onCompleteParse(); } try { this.getReader().close(); this.getWriter().flush(); this.getWriter().close(); } catch (IOException ioe2) { LOG.error(ioe2); throw new RuntimeException(ioe2); } } }
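analyse() above buffers raw text segments in a LinkedList and only starts processing once size() reaches three, so a previous, current and next segment are always available; a simplified sketch of that buffering:

import java.util.LinkedList;

public class SegmentBufferDemo {
    public static void main(String[] args) {
        LinkedList<String> textSegments = new LinkedList<String>();
        // prime with two empty segments so the first real block has a "previous" context
        textSegments.addLast("");
        textSegments.addLast("");

        for (String block : new String[] { "block 1", "block 2", "block 3" }) {
            textSegments.addLast(block);

            // process only while a previous/current/next triple is buffered
            while (textSegments.size() >= 3) {
                String prevText = textSegments.removeFirst();
                String text = textSegments.removeFirst();
                String nextText = textSegments.removeFirst();
                System.out.println("segment with context: [" + prevText + "|" + text + "|" + nextText + "]");
                // keep current and next for the following pass
                textSegments.addFirst(nextText);
                textSegments.addFirst(text);
            }
        }
    }
}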
From source file:elh.eus.absa.Features.java
/** * Function fills the attribute vectors for the instances existing in the Conll tabulated formatted corpus given. * Attribute vectors contain the features loaded by the creatFeatureSet() function. * //w ww .j a va2 s .co m * @param boolean save : whether the Instances file should be saved to an arff file or not. * @return Weka Instances object containing the attribute vectors filled with the features specified * in the parameter file. */ public Instances loadInstancesTAB(boolean save, String prefix) { String savePath = params.getProperty("fVectorDir") + File.separator + "arff" + File.separator + "train_" + prefix; HashMap<String, Opinion> trainExamples = corpus.getOpinions(); int trainExamplesNum = trainExamples.size(); int bowWin = 0; if (params.containsKey("window")) { bowWin = Integer.parseInt(params.getProperty("window")); savePath = savePath + "_w" + bowWin; } //System.out.println("train examples: "+trainExamplesNum); //Create the Weka object for the training set Instances rsltdata = new Instances("train", atts, trainExamplesNum); // setting class attribute (last attribute in train data. //traindata.setClassIndex(traindata.numAttributes() - 1); System.err.println("Features: loadInstancesTAB() - featNum: " + this.featNum + " - trainset attrib num -> " + rsltdata.numAttributes() + " - "); System.out.println("Features: loadInstancesTAB() - featNum: " + this.featNum + " - trainset attrib num -> " + rsltdata.numAttributes() + " - "); int instId = 1; // fill the vectors for each training example for (String oId : trainExamples.keySet()) { //System.err.println("sentence: "+ corpus.getOpinionSentence(o.getId())); //value vector double[] values = new double[featNum]; // first element is the instanceId values[rsltdata.attribute("instanceId").index()] = instId; LinkedList<String> ngrams = new LinkedList<String>(); int ngramDim; try { ngramDim = Integer.valueOf(params.getProperty("wfngrams")); } catch (Exception e) { ngramDim = 0; } boolean polNgrams = false; if (params.containsKey("polNgrams")) { polNgrams = params.getProperty("polNgrams").equalsIgnoreCase("yes"); } String[] noWindow = corpus.getOpinionSentence(oId).split("\n"); //counter for opinion sentence token number. Used for computing relative values of the features int tokNum = noWindow.length; List<String> window = Arrays.asList(noWindow); Integer end = corpus.getOpinion(oId).getTo(); // apply window if window active (>0) and if the target is not null (to=0) if ((bowWin > 0) && (end > 0)) { Integer start = corpus.getOpinion(oId).getFrom(); Integer from = start - bowWin; if (from < 0) { from = 0; } Integer to = end + bowWin; if (to > noWindow.length - 1) { to = noWindow.length - 1; } window = Arrays.asList(Arrays.copyOfRange(noWindow, from, to)); } //System.out.println("Sentence: "+corpus.getOpinionSentence(oId)+" - target: "+corpus.getOpinion(oId).getTarget()+ // "\n window: from-> "+window.get(0).getForm()+" to-> "+window.get(window.size()-1)+" .\n"); //System.err.println(Arrays.toString(window.toArray())); // word form ngram related features for (String wf : window) { String[] fields = wf.split("\t"); String wfStr = normalize(fields[0], params.getProperty("normalization", "none")); // blank line means we found a sentence end. Empty n-gram list and reiniciate. if (wf.equals("")) { // add ngrams to the feature vector checkNgramFeatures(ngrams, values, "", 1, true); //toknum // since wf is empty no need to check for clusters and other features. 
                continue;
            }

            if (params.containsKey("wfngrams") && ngramDim > 0) {
                if (!savePath.contains("_wf" + ngramDim)) {
                    savePath = savePath + "_wf" + ngramDim;
                }
                //if the current word form is in the ngram list activate the feature in the vector
                if (ngrams.size() >= ngramDim) {
                    ngrams.removeFirst();
                }
                ngrams.add(wfStr);

                // add ngrams to the feature vector
                checkNgramFeatures(ngrams, values, "", 1, false); //toknum
            }

            // Clark cluster info corresponding to the current word form
            if (params.containsKey("clark") && attributeSets.get("ClarkCl").containsKey(wfStr)) {
                if (!savePath.contains("_cl")) {
                    savePath = savePath + "_cl";
                }
                values[rsltdata.attribute("ClarkClId_" + attributeSets.get("ClarkCl").get(wfStr)).index()]++;
            }

            // Brown cluster info corresponding to the current word form
            if (params.containsKey("brown") && attributeSets.get("BrownCl").containsKey(wfStr)) {
                if (!savePath.contains("_br")) {
                    savePath = savePath + "_br";
                }
                values[rsltdata.attribute("BrownClId_" + attributeSets.get("BrownCl").get(wfStr)).index()]++;
            }

            // word2vec cluster info corresponding to the current word form
            if (params.containsKey("word2vec") && attributeSets.get("w2vCl").containsKey(wfStr)) {
                if (!savePath.contains("_w2v")) {
                    savePath = savePath + "_w2v";
                }
                values[rsltdata.attribute("w2vClId_" + attributeSets.get("w2vCl").get(wfStr)).index()]++;
            }
        }

        //empty ngram list and add remaining ngrams to the feature list
        checkNgramFeatures(ngrams, values, "", 1, true); //toknum

        // PoS tagger related attributes: lemmas and pos tags
        if (params.containsKey("lemmaNgrams")
                || (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0"))
                || params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) {
            ngrams = new LinkedList<String>();
            if (params.containsKey("lemmaNgrams")
                    && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) {
                ngramDim = Integer.valueOf(params.getProperty("lemmaNgrams"));
            } else {
                ngramDim = 3;
            }

            LinkedList<String> posNgrams = new LinkedList<String>();
            int posNgramDim = 0;
            if (params.containsKey("pos")) {
                posNgramDim = Integer.valueOf(params.getProperty("pos"));
            }

            for (String t : window) {
                //lemmas // && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))
                if ((params.containsKey("lemmaNgrams")) || params.containsKey("polarLexiconGeneral")
                        || params.containsKey("polarLexiconDomain")) {
                    if (!savePath.contains("_l" + ngramDim)) {
                        savePath = savePath + "_l" + ngramDim;
                    }
                    //blank line means we found a sentence end. Empty n-gram list and reinitialize.
                    if (t.equals("")) {
                        // check both lemma n-grams and polarity lexicons, and add values to the feature vector
                        checkNgramsAndPolarLexicons(ngrams, values, "lemma", 1, tokNum, true, polNgrams); //toknum

                        // since t is empty no need to check for clusters and other features.
                        continue;
                    }

                    String[] fields = t.split("\t");
                    if (fields.length < 2) {
                        continue;
                    }
                    String lemma = normalize(fields[1], params.getProperty("normalization", "none"));

                    if (ngrams.size() >= ngramDim) {
                        ngrams.removeFirst();
                    }
                    ngrams.add(lemma);

                    // check both lemma n-grams and polarity lexicons, and add values to the feature vector
                    checkNgramsAndPolarLexicons(ngrams, values, "lemma", 1, tokNum, false, polNgrams);
                }

                //pos tags
                if (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0")) {
                    if (!savePath.contains("_p")) {
                        savePath = savePath + "_p";
                    }
                    if (posNgrams.size() >= posNgramDim) {
                        posNgrams.removeFirst();
                    }

                    String[] fields = t.split("\t");
                    if (fields.length < 3) {
                        continue;
                    }
                    String pos = fields[2];
                    posNgrams.add(pos);

                    // add ngrams to the feature vector
                    checkNgramFeatures(posNgrams, values, "pos", 1, false);
                }
            } //endFor

            //empty ngram list and add remaining ngrams to the feature list
            // check both lemma n-grams and polarity lexicons, and add values to the feature vector
            checkNgramsAndPolarLexicons(ngrams, values, "", 1, tokNum, true, polNgrams);

            //empty pos ngram list and add remaining pos ngrams to the feature list
            checkNgramFeatures(posNgrams, values, "pos", 1, true);
        }

        // add sentence length as a feature
        if (params.containsKey("sentenceLength")
                && (!params.getProperty("sentenceLength").equalsIgnoreCase("no"))) {
            values[rsltdata.attribute("sentenceLength").index()] = tokNum;
        }

        // compute uppercase ratio before normalization (if needed)
        //double upRatio =0.0;
        //if (params.getProperty("upperCaseRatio", "no").equalsIgnoreCase("yes"))
        //{
        //   String upper = opNormalized.replaceAll("[a-z]", "");
        //   upRatio = (double)upper.length() / (double)opNormalized.length();
        //   values[rsltdata.attribute("upperCaseRation").index()] = upRatio;
        //}

        //create object for the current instance and associate it with the current train dataset.
        Instance inst = new SparseInstance(1.0, values);
        inst.setDataset(rsltdata);

        // add category attribute values
        String cat = trainExamples.get(oId).getCategory();
        if (params.containsKey("categories") && params.getProperty("categories").compareTo("E&A") == 0) {
            if (cat.compareTo("NULL") == 0) {
                inst.setValue(rsltdata.attribute("entCat").index(), cat);
                inst.setValue(rsltdata.attribute("attCat").index(), cat);
            } else {
                String[] splitCat = cat.split("#");
                inst.setValue(rsltdata.attribute("entCat").index(), splitCat[0]);
                inst.setValue(rsltdata.attribute("attCat").index(), splitCat[1]);
            }
            //inst.setValue(attIndexes.get("entAttCat"), cat);
        } else if (params.containsKey("categories") && params.getProperty("categories").compareTo("E#A") == 0) {
            inst.setValue(rsltdata.attribute("entAttCat").index(), cat);
        }

        if (params.containsKey("polarity") && params.getProperty("polarity").compareTo("yes") == 0) {
            // add class value as a double (Weka stores all values as doubles)
            String pol = normalizePolarity(trainExamples.get(oId).getPolarity());
            if (pol != null && !pol.isEmpty()) {
                inst.setValue(rsltdata.attribute("polarityCat"), pol);
            } else {
                //System.err.println("polarity: _"+pol+"_");
                inst.setMissing(rsltdata.attribute("polarityCat"));
            }
        }

        //add instance to train data
        rsltdata.add(inst);

        //store opinion Id and instance Id
        this.opInst.put(oId, instId);
        instId++;
    }

    System.err.println("Features : loadInstancesTAB() - training data ready total number of examples -> "
            + trainExamplesNum + " - " + rsltdata.numInstances());

    if (save) {
        try {
            savePath = savePath + ".arff";
            System.err.println("arff written to: " + savePath);
            ArffSaver saver = new ArffSaver();
            saver.setInstances(rsltdata);
            saver.setFile(new File(savePath));
            saver.writeBatch();
        } catch (IOException e1) {
            e1.printStackTrace();
        } catch (Exception e2) {
            e2.printStackTrace();
        }
    }

    return rsltdata;
}
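Usage note: the n-gram buffers above (ngrams and posNgrams) are kept bounded by comparing size() against the configured n-gram order and removing the oldest element before appending a new one. The sketch below isolates that sliding-window idiom with made-up tokens and a fixed order of 3; it is an illustration of the pattern, not code from elh.eus.absa.Features:

import java.util.LinkedList;

public class NgramWindowDemo {
    public static void main(String[] args) {
        int ngramDim = 3; // maximum n-gram order, analogous to the "wfngrams" parameter
        LinkedList<String> ngrams = new LinkedList<String>();

        String[] tokens = { "the", "quick", "brown", "fox", "jumps" };
        for (String token : tokens) {
            // keep at most ngramDim tokens in the window: drop the oldest entry first
            if (ngrams.size() >= ngramDim) {
                ngrams.removeFirst();
            }
            ngrams.add(token);
            System.out.println("window (" + ngrams.size() + "): " + ngrams);
        }
    }
}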