Example usage for java.util.LinkedList size()

List of usage examples for java.util.LinkedList size()

Introduction

This page collects usage examples for java.util.LinkedList size().

Prototype

int size()

Returns the number of elements in this list.
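
Before the full examples below, a minimal self-contained sketch of the call (class and variable names are illustrative):

import java.util.LinkedList;

public class LinkedListSizeDemo {
    public static void main(String[] args) {
        LinkedList<String> list = new LinkedList<String>();
        System.out.println(list.size()); // 0: the list starts empty

        list.add("a");
        list.add("b");
        System.out.println(list.size()); // 2 after two insertions

        list.removeFirst();
        System.out.println(list.size()); // 1 after one removal
    }
}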

Usage

From source file: eu.aniketos.scpm.impl.CompositionPlanner.java

private List<ICompositionPlan> createCompositionPlans(ICompositionPlan serviceSpecification,
        Map<String, Set<Service>> mapTaskServices) {

    //Creation of several composition plans combining the service locations

    if (logger.isDebugEnabled()) {
        logger.debug("Creating composition Plans for process " + serviceSpecification.getCompositionPlanID()); //$NON-NLS-1$
    }

    Set<String> taskIds = mapTaskServices.keySet();

    Object[] taskIdsArray = taskIds.toArray();

    LinkedList<ICompositionPlan> listCompositionPlan = new LinkedList<ICompositionPlan>();

    XMLOutputter xmlOutput = new XMLOutputter();
    xmlOutput.setFormat(Format.getPrettyFormat());

    Iterator<String> itTaskIds = taskIds.iterator();

    List<Set<Service>> list = new LinkedList<Set<Service>>();
    while (itTaskIds.hasNext()) {
        String taskId = itTaskIds.next();
        Set<Service> services = mapTaskServices.get(taskId);
        Set<String> locations = new HashSet<String>();
        for (Service service : services) {
            locations.add(service.getLocation());
        }
        list.add(services);
    }

    Set<List<Service>> set = Sets.cartesianProduct(list);
    Object[] setArray = set.toArray();

    for (int j = 0; j < setArray.length; j++) {
        Document newDocument = BPMNParser.getDocument(serviceSpecification.getBPMNXML());

        //   List<String> locationsParsed = new Vector<String>();

        @SuppressWarnings("unchecked")
        List<Object> listS = (List<Object>) setArray[j];
        for (int i = taskIdsArray.length - 1; i >= 0; i--) {
            String taskId = (String) taskIdsArray[i];
            //            String taskId = serviceIds.get(i);
            Service service = (Service) listS.get(i);
            String location = service.getLocation();
            //            if(!locationsParsed.contains(location)){
            //               String portTypeName = WsdlParser.getInstance().getPortTypeName(location);
            //               Element interfaceElement = XMLParser.addPortTypeName(newDocument, portTypeName);
            //               Map<String,List<String>> hashOperationsData = WsdlParser.getInstance().getOperationsData(location);
            //               Set<String> setOperations = hashOperationsData.keySet();
            //               for(String operation : setOperations){
            //                  List<String> listInputOutput = hashOperationsData.get(operation);
            //                  XMLParser.addOperation(newDocument, interfaceElement, portTypeName, operation, listInputOutput);
            //               }
            //               locationsParsed.add(location);
            //            }

            String serviceId = service.getServiceId();
            logger.debug("ServiceId: " + serviceId);

            String provider = service.getProvider();

            BPMNParser.addServiceId(newDocument, taskId, serviceId);
            BPMNParser.addLocationField(newDocument, taskId, location);
            BPMNParser.addProviderField(newDocument, taskId, provider);
        }
        int c = j + 1;
        BPMNParser.addProcessId(newDocument, "compositionPlan" + c);
        logger.debug("Created compositon plan" + c);

        String specification = xmlOutput.outputString(newDocument);
        ICompositionPlan compositionPlan = new CompositionPlan(specification);
        compositionPlan.setCompositionPlanID("compositionPlan" + c);

        //Interaction with the composition planner

        listCompositionPlan.add(compositionPlan);
        logger.debug("compositionPlan " + compositionPlan.getBPMNXML());

    }

    logger.info("Created " + listCompositionPlan.size() + " Composition Plans");

    return listCompositionPlan;

}
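
The method above accumulates plans in a LinkedList and reports listCompositionPlan.size() in the final log line. A minimal sketch of that accumulate-then-count idiom (names and values are illustrative):

import java.util.LinkedList;
import java.util.List;

public class PlanCountSketch {
    public static void main(String[] args) {
        List<String> listCompositionPlan = new LinkedList<String>();
        for (int j = 0; j < 3; j++) {
            listCompositionPlan.add("compositionPlan" + (j + 1));
        }
        // size() gives the number of accumulated plans for the summary log
        System.out.println("Created " + listCompositionPlan.size() + " Composition Plans");
    }
}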

From source file: com.buaa.cfs.conf.Configuration.java

private Resource loadResource(Properties properties, Resource wrapper, boolean quiet) {
    String name = UNKNOWN_RESOURCE;
    try {
        Object resource = wrapper.getResource();
        name = wrapper.getName();

        DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance();
        //ignore all comments inside the xml file
        docBuilderFactory.setIgnoringComments(true);

        //allow includes in the xml file
        docBuilderFactory.setNamespaceAware(true);
        try {
            docBuilderFactory.setXIncludeAware(true);
        } catch (UnsupportedOperationException e) {
            LOG.error("Failed to set setXIncludeAware(true) for parser " + docBuilderFactory + ":" + e, e);
        }
        DocumentBuilder builder = docBuilderFactory.newDocumentBuilder();
        Document doc = null;
        Element root = null;
        boolean returnCachedProperties = false;

        if (resource instanceof URL) { // a URL resource
            doc = parse(builder, (URL) resource);
        } else if (resource instanceof String) { // a CLASSPATH resource
            URL url = getResource((String) resource);
            doc = parse(builder, url);
        } else if (resource instanceof Path) { // a file resource
            // Can't use FileSystem API or we get an infinite loop
            // since FileSystem uses Configuration API.  Use java.io.File instead.
            File file = new File(((Path) resource).toUri().getPath()).getAbsoluteFile();
            if (file.exists()) {
                if (!quiet) {
                    LOG.debug("parsing File " + file);
                }
                doc = parse(builder, new BufferedInputStream(new FileInputStream(file)),
                        ((Path) resource).toString());
            }
        } else if (resource instanceof InputStream) {
            doc = parse(builder, (InputStream) resource, null);
            returnCachedProperties = true;
        } else if (resource instanceof Properties) {
            overlay(properties, (Properties) resource);
        } else if (resource instanceof Element) {
            root = (Element) resource;
        }

        if (root == null) {
            if (doc == null) {
                if (quiet) {
                    return null;
                }
                throw new RuntimeException(resource + " not found");
            }
            root = doc.getDocumentElement();
        }
        Properties toAddTo = properties;
        if (returnCachedProperties) {
            toAddTo = new Properties();
        }
        if (!"configuration".equals(root.getTagName()))
            LOG.fatal("bad conf file: top-level element not <configuration>");
        NodeList props = root.getChildNodes();
        DeprecationContext deprecations = deprecationContext.get();
        for (int i = 0; i < props.getLength(); i++) {
            Node propNode = props.item(i);
            if (!(propNode instanceof Element))
                continue;
            Element prop = (Element) propNode;
            if ("configuration".equals(prop.getTagName())) {
                loadResource(toAddTo, new Resource(prop, name), quiet);
                continue;
            }
            if (!"property".equals(prop.getTagName()))
                LOG.warn("bad conf file: element not <property>");
            NodeList fields = prop.getChildNodes();
            String attr = null;
            String value = null;
            boolean finalParameter = false;
            LinkedList<String> source = new LinkedList<String>();
            for (int j = 0; j < fields.getLength(); j++) {
                Node fieldNode = fields.item(j);
                if (!(fieldNode instanceof Element))
                    continue;
                Element field = (Element) fieldNode;
                if ("name".equals(field.getTagName()) && field.hasChildNodes())
                    attr = StringInterner.weakIntern(((Text) field.getFirstChild()).getData().trim());
                if ("value".equals(field.getTagName()) && field.hasChildNodes())
                    value = StringInterner.weakIntern(((Text) field.getFirstChild()).getData());
                if ("final".equals(field.getTagName()) && field.hasChildNodes())
                    finalParameter = "true".equals(((Text) field.getFirstChild()).getData());
                if ("source".equals(field.getTagName()) && field.hasChildNodes())
                    source.add(StringInterner.weakIntern(((Text) field.getFirstChild()).getData()));
            }
            source.add(name);

            // Ignore this parameter if it has already been marked as 'final'
            if (attr != null) {
                if (deprecations.getDeprecatedKeyMap().containsKey(attr)) {
                    DeprecatedKeyInfo keyInfo = deprecations.getDeprecatedKeyMap().get(attr);
                    keyInfo.clearAccessed();
                    for (String key : keyInfo.newKeys) {
                        // update new keys with deprecated key's value
                        loadProperty(toAddTo, name, key, value, finalParameter,
                                source.toArray(new String[source.size()]));
                    }
                } else {
                    loadProperty(toAddTo, name, attr, value, finalParameter,
                            source.toArray(new String[source.size()]));
                }
            }
        }

        if (returnCachedProperties) {
            overlay(properties, toAddTo);
            return new Resource(toAddTo, name);
        }
        return null;
    } catch (IOException | DOMException | SAXException | ParserConfigurationException e) {
        LOG.fatal("error parsing conf " + name, e);
        throw new RuntimeException(e);
    }
}
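
In the loader above, size() appears in source.toArray(new String[source.size()]), which presizes the target array so toArray() can fill it directly. A minimal sketch of that list-to-array idiom (the file names are illustrative):

import java.util.LinkedList;

public class ListToArraySketch {
    public static void main(String[] args) {
        LinkedList<String> source = new LinkedList<String>();
        source.add("core-default.xml");
        source.add("core-site.xml");

        // the array is sized with size(), so toArray() needs no reallocation
        String[] sources = source.toArray(new String[source.size()]);
        System.out.println(sources.length); // 2
    }
}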

From source file: cellularAutomata.analysis.PricingDistributionAnalysis.java

/**
 * Plots the pricing data as a time series.
 */
private void plotTimeSeriesData() {
    for (int i = 0; i < numberOfStates; i++) {
        // create a list of all data points (exclude 0 if necessary)
        LinkedList<Point2D.Double> allPoints = new LinkedList<Point2D.Double>();
        int startPosition = 0;
        if (!plotZeroState) {
            startPosition = 1;
        }

        allPoints.addAll(timeSeriesOfDistributionList[i]);

        // set the min and max values on the plot
        Point2D firstPoint = (Point2D) timeSeriesOfDistributionList[i].getFirst();
        timeSeriesPlot[i].setMaximumXValue(firstPoint.getX() + MAX_NUMBER_TO_PLOT - 1);
        timeSeriesPlot[i].setMinimumXValue(0);
        timeSeriesPlot[i].setMinimumYValue(0.0);
        timeSeriesPlot[i].setXAxisLabel("Histogram");
        timeSeriesPlot[i].setYAxisLabel("% bin");
        timeSeriesPlot[i].showPlotLines(false);

        // set the max y-value
        double maxYValue = 0.0;
        Iterator<Point2D.Double> iterator = allPoints.iterator();
        while (iterator.hasNext()) {
            Point2D.Double point = iterator.next();
            if (point.y > maxYValue) {
                maxYValue = point.y;
            }
        }

        // now round up to the nearest tenth and add 0.1. This gives some
        // wiggle room before the plot has to redraw the y-axis; redrawing
        // the y-axis too often looks bad. This case statement ensures that
        // we don't get something like 0.7999999 instead of 0.8 (which is
        // what happened when maxYValue was calculated with a formula that
        // divides by 10.0).
        switch ((int) Math.ceil(maxYValue * 10.0)) {
        case 0:
            maxYValue = 0.1;
            break;
        case 1:
            maxYValue = 0.2;
            break;
        case 2:
            maxYValue = 0.3;
            break;
        case 3:
            maxYValue = 0.4;
            break;
        case 4:
            maxYValue = 0.5;
            break;
        case 5:
            maxYValue = 0.6;
            break;
        case 6:
            maxYValue = 0.7;
            break;
        case 7:
            maxYValue = 0.8;
            break;
        case 8:
            maxYValue = 0.9;
            break;
        case 9:
        case 10:
        case 11:
        case 12:
            maxYValue = 1.0;
            break;
        }

        if (maxYValue > 1.0) {
            maxYValue = 1.0;
        }

        timeSeriesPlot[i].setMaximumYValue(maxYValue);

        // draw some extra points on the y axes (looks good)
        int numberOfInteriorYValueLabels = (int) ((maxYValue / 20));
        double[] yValues = new double[numberOfInteriorYValueLabels];
        for (int j = 0; j < yValues.length; j++) {
            double answer = (j * 20);

            yValues[j] = answer;
        }
        timeSeriesPlot[i].setExtraYAxisValues(yValues);

        // specify colors for the points
        Color[] colorArray = new Color[allPoints.size()];
        CellStateView view = Cell.getView();

        Color stateColor = view.getDisplayColor(new IntegerCellState(i), null, new Coordinate(0, 0));

        for (int j = 0; j < timeSeriesOfDistributionList[i].size(); j++) {
            colorArray[j] = stateColor;
        }

        timeSeriesPlot[i].setPointDisplayColors(colorArray);

        // draw the points!
        timeSeriesPlot[i].drawPoints(allPoints);
    }
}
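
Here size() dimensions a Color array parallel to the point list, one entry per plotted point. A minimal sketch of that parallel-array pattern (the plotting classes are omitted):

import java.awt.Color;
import java.awt.geom.Point2D;
import java.util.LinkedList;

public class ParallelColorSketch {
    public static void main(String[] args) {
        LinkedList<Point2D.Double> allPoints = new LinkedList<Point2D.Double>();
        allPoints.add(new Point2D.Double(0, 0.25));
        allPoints.add(new Point2D.Double(1, 0.50));

        // one color slot per point, sized from the list
        Color[] colorArray = new Color[allPoints.size()];
        for (int j = 0; j < allPoints.size(); j++) {
            colorArray[j] = Color.RED;
        }
        System.out.println(colorArray.length); // 2
    }
}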

From source file: gedi.riboseq.inference.orf.OrfFinder.java

/**
 * Coordinates are in codonsRegion space!
 * @param index
 * @param sequence
 * @param sg
 * @param codonsRegion
 * @return
 */
public ArrayList<OrfWithCodons> findOrfs(int index, String sequence, SpliceGraph sg,
        ImmutableReferenceGenomicRegion<IntervalTreeSet<Codon>> codonsRegion) {
    SimpleDirectedGraph<Codon> fg = new SimpleDirectedGraph<Codon>("Codongraph");

    //      if (!codonsRegion.getReference().toString().equals("chr4+") || !codonsRegion.getRegion().contains(140_283_087))
    //         return 0;

    LeftMostInFrameAndClearList buff = new LeftMostInFrameAndClearList();

    IntervalTreeSet<Codon> codons = codonsRegion.getData();
    codons.removeIf(c -> c.getTotalActivity() < minCodonActivity);
    if (codons.size() == 0)
        return new ArrayList<OrfWithCodons>();

    // add stop codons for easy orf inference
    HashSet<Codon> stopCodons = new HashSet<Codon>();
    Trie<String> stop = new Trie<String>();
    stop.put("TAG", "TAG");
    stop.put("TGA", "TGA");
    stop.put("TAA", "TAA");
    stop.iterateAhoCorasick(sequence)
            .map(r -> new Codon(new ArrayGenomicRegion(r.getStart(), r.getEnd()), r.getValue()))
            .toCollection(stopCodons);

    for (Intron intr : sg.iterateIntrons().loop()) {
        ArrayGenomicRegion reg = new ArrayGenomicRegion(intr.getStart() - 2, intr.getStart(), intr.getEnd(),
                intr.getEnd() + 1);
        String cod = stop.get(SequenceUtils.extractSequence(reg, sequence));
        if (cod != null)
            stopCodons.add(new Codon(reg, cod));

        reg = new ArrayGenomicRegion(intr.getStart() - 1, intr.getStart(), intr.getEnd(), intr.getEnd() + 2);
        cod = stop.get(SequenceUtils.extractSequence(reg, sequence));
        if (cod != null)
            stopCodons.add(new Codon(reg, cod));
    }
    stopCodons.removeAll(codons);
    codons.addAll(stopCodons);

    ArrayList<OrfWithCodons> re = new ArrayList<OrfWithCodons>();
    HashSet<Codon> usedForAnno = new HashSet<Codon>();

    if (assembleAnnotationFirst) {
        // new: first use annotated transcripts in a greedy fashion
        ArrayList<ImmutableReferenceGenomicRegion<Transcript>> transcripts = annotation.ei(codonsRegion)
                .filter(t -> t.getData().isCoding()).map(t -> codonsRegion.induce(t, "T")).list();

        int acount = 0;
        LinkedList<OrfWithCodons> orfs = new LinkedList<OrfWithCodons>();
        GenomicRegion best;
        HashSet<Codon> aremoved = new HashSet<Codon>();

        do {
            best = null;
            double bestSum = 0;
            for (ImmutableReferenceGenomicRegion<Transcript> tr : transcripts) {
                double[] a = new double[tr.getRegion().getTotalLength()];
                for (Codon c : codons) {
                    if (tr.getRegion().containsUnspliced(c)) {
                        int p = tr.induce(c.getStart());

                        assert a[p] == 0;

                        if (!aremoved.contains(c))
                            a[p] = c.totalActivity;
                        if (c.isStop())
                            a[p] = -1;
                    }
                }
                for (int f = 0; f < 3; f++) {
                    int s = -1;
                    double sum = 0;

                    for (int p = f; p < a.length; p += 3) {
                        if (a[p] == -1) {//stop
                            if (sum > bestSum) {
                                bestSum = sum;
                                best = tr.getRegion().map(new ArrayGenomicRegion(s, p + 3));
                            }
                            s = -1;
                            sum = 0;
                        } else
                            sum += a[p];

                        if (a[p] > 0 && s == -1)
                            s = p;
                    }
                }
            }
            if (best != null) {
                ArrayList<Codon> cods = new ArrayList<>();
                int uniqueCodons = 0;
                double uniqueActivity = 0;
                double totalActivity = 0;

                for (Codon c : codons) {
                    if (best.containsUnspliced(c) && best.induce(c.getStart()) % 3 == 0) {
                        if (aremoved.add(c)) {
                            uniqueActivity += c.totalActivity;
                            uniqueCodons++;
                        }
                        totalActivity += c.totalActivity;
                        if (c.totalActivity > 0)
                            cods.add(c);
                    }
                }
                //                  System.out.println(codonsRegion.map(best));
                if ((uniqueCodons >= minUniqueCodons || uniqueCodons == cods.size())
                        && uniqueActivity > minUniqueActivity && totalActivity > minOrfTotalActivity) {

                    Collections.sort(cods);
                    usedForAnno.addAll(cods);

                    OrfWithCodons orf = new OrfWithCodons(index, 0, acount++, best.toArrayGenomicRegion(), cods,
                            true);
                    orfs.add(orf);
                }

            }
        } while (best != null);

        if (orfs.size() > 1) {

            // they are not necessarily connected!
            LinkedList<OrfWithCodons>[] connected = findConnectedOrfs(orfs);
            orfs.clear();

            for (LinkedList<OrfWithCodons> corfs : connected) {
                for (boolean changed = true; changed && corfs.size() > 1;) {
                    changed = false;

                    if (useEM)
                        inferOverlappingOrfActivitiesEM(corfs);
                    else
                        overlapUniqueCoverage(corfs);

                    Iterator<OrfWithCodons> it = corfs.iterator();
                    while (it.hasNext()) {
                        OrfWithCodons orf = it.next();
                        if (orf.getEstimatedTotalActivity() < minOrfTotalActivity) {
                            it.remove();
                            changed = true;
                        }
                    }
                }

                if (corfs.size() > 1)
                    distributeCodons(corfs);
                orfs.addAll(corfs);
            }
        }
        re.addAll(orfs);
    }

    // since only edges are represented in the splice graph, singleton codons are discarded (which makes sense anyway)
    for (Codon c : codons) {
        if (!c.isStop()) {
            // find unspliced successors (can be more than one, when the successor codon itself is spliced! all of them have the same start!)
            int max = c.getEnd() + maxAminoDist * 3;
            for (Codon n : codons
                    .getIntervalsIntersecting(c.getEnd(), c.getEnd() + maxAminoDist * 3, buff.startAndClear(c))
                    .get()) {
                if (!containsInframeStop(sequence.substring(c.getEnd(), n.getStart())))
                    fg.addInteraction(c, n);
                max = n.getStart() + 2;
            }

            // find all spliced successors for each splice junction that comes before n or maxAminoDist
            sg.forEachIntronStartingBetween(c.getEnd(), max + 1, intron -> {
                for (Codon n : codons.getIntervalsIntersecting(intron.getEnd(),
                        intron.getEnd() + maxAminoDist * 3 - (intron.getStart() - c.getEnd()),
                        buff.startAndClear(c, intron)).get())
                    if (!containsInframeStop(SequenceUtils.extractSequence(new ArrayGenomicRegion(c.getStart(),
                            intron.getStart(), intron.getEnd(), n.getStart()), sequence)))
                        fg.addInteraction(c, n, intron);
            });
        }
    }

    int cc = 1;
    for (SimpleDirectedGraph<Codon> g : fg.getWeaklyConnectedComponents()) {
        if (EI.wrap(g.getSources()).mapToDouble(c -> c.getTotalActivity()).sum() == 0)
            continue;

        // iterate longest paths in g
        LinkedList<Codon> topo = g.getTopologicalOrder();
        HashSet<Codon> remInTopo = new HashSet<Codon>(topo);
        remInTopo.removeIf(c -> !stopCodons.contains(c) && !usedForAnno.contains(c));
        HashSet<Codon> removed = new HashSet<Codon>(remInTopo);

        //         double maxPathScore = 0;

        LinkedList<OrfWithCodons> orfs = new LinkedList<OrfWithCodons>();

        int count = 0;
        while (removed.size() < topo.size()) {
            HashMap<Codon, MutablePair<GenomicRegion, Double>> longestPrefixes = new HashMap<Codon, MutablePair<GenomicRegion, Double>>();
            for (Codon c : topo)
                longestPrefixes.put(c, new MutablePair<GenomicRegion, Double>(c,
                        removed.contains(c) ? 0 : (c.getTotalActivity())));

            Codon longestEnd = null;
            HashMap<Codon, Codon> backtracking = new HashMap<Codon, Codon>();

            for (Codon c : topo) {
                //               if (codonsRegion.map(c).getStart()==100_466_118)
                //                  System.out.println(c);
                //               
                //               if (codonsRegion.map(c).getStart()==100_465_842)
                //                  System.out.println(c);

                double len = longestPrefixes.get(c).Item2;
                for (AdjacencyNode<Codon> n = g.getTargets(c); n != null; n = n.next) {
                    MutablePair<GenomicRegion, Double> pref = longestPrefixes.get(n.node);

                    double nnact = removed.contains(n.node) ? 0 : (n.node.getTotalActivity());
                    if (pref.Item2 <= len + nnact) {
                        pref.set(extendFullPath(longestPrefixes.get(c).Item1, c, n.node, n.getLabel()),
                                len + nnact);
                        backtracking.put(n.node, c);
                    }
                }
                if (longestEnd == null || longestPrefixes.get(longestEnd).Item2 <= len)
                    longestEnd = c;

            }

            // determine longest path by backtracking and mark all codons on the path as removed
            ArrayList<Codon> orfCodons = new ArrayList<Codon>();
            double totalActivity = 0;
            double uniqueActivity = 0;
            int uniqueCodons = 0;
            for (Codon c = longestEnd; c != null; c = backtracking.get(c)) {
                if (removed.add(c) && c.getTotalActivity() > 0) {
                    uniqueCodons++;
                    uniqueActivity += c.getTotalActivity();
                }

                if (c.getTotalActivity() > 0) // to remove dummy stop codons
                    orfCodons.add(c);
                totalActivity += c.getTotalActivity();
            }

            //            System.out.println(codonsRegion.map(longestPrefixes.get(longestEnd).Item1));

            if ((uniqueCodons >= minUniqueCodons || uniqueCodons == orfCodons.size())
                    && uniqueActivity > minUniqueActivity && totalActivity > minOrfTotalActivity) {
                Collections.reverse(orfCodons);

                MutablePair<GenomicRegion, Double> triple = longestPrefixes.get(longestEnd);
                ArrayGenomicRegion region = triple.Item1.toArrayGenomicRegion();
                String lastCodon = SequenceUtils.extractSequence(
                        region.map(
                                new ArrayGenomicRegion(region.getTotalLength() - 3, region.getTotalLength())),
                        sequence);

                OrfWithCodons orf = new OrfWithCodons(index, cc, count++, region, orfCodons,
                        stop.containsKey(lastCodon));
                orfs.add(orf);
            }

            //            maxPathScore = Math.max(maxPathScore,totalActivity);
        }

        if (orfs.size() > 1) {

            // they are not necessarily connected!

            LinkedList<OrfWithCodons>[] connected = findConnectedOrfs(orfs);
            orfs.clear();

            for (LinkedList<OrfWithCodons> corfs : connected) {
                for (boolean changed = true; changed && corfs.size() > 1;) {
                    changed = false;

                    if (useEM)
                        inferOverlappingOrfActivitiesEM(corfs);
                    else
                        overlapUniqueCoverage(corfs);

                    Iterator<OrfWithCodons> it = corfs.iterator();
                    while (it.hasNext()) {
                        OrfWithCodons orf = it.next();
                        if (orf.getEstimatedTotalActivity() < minOrfTotalActivity) {
                            it.remove();
                            changed = true;
                        }
                    }
                }

                if (corfs.size() > 1)
                    distributeCodons(corfs);
                orfs.addAll(corfs);
            }

        }

        re.addAll(orfs);

        cc++;
    }

    return re;

}
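
findOrfs() above drives its longest-path extraction with while (removed.size() < topo.size()), i.e. it repeats until every codon in the topological order has been consumed. A minimal sketch of that counting-based termination (element names are illustrative):

import java.util.HashSet;
import java.util.LinkedList;

public class ConsumeAllSketch {
    public static void main(String[] args) {
        LinkedList<String> topo = new LinkedList<String>();
        topo.add("c1");
        topo.add("c2");
        topo.add("c3");

        HashSet<String> removed = new HashSet<String>();
        // loop until the removed set has grown to cover the whole list
        while (removed.size() < topo.size()) {
            for (String c : topo) {
                if (removed.add(c)) { // stand-in for consuming one longest path
                    System.out.println("consumed " + c);
                    break;
                }
            }
        }
    }
}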

From source file: edu.harvard.iq.dvn.ingest.dsb.impl.DvnRJobRequest.java

public List<List<String>> getValueRange(String tkn) {

    dbgLog.fine("received token=" + tkn);
    String step0 = StringUtils.strip(tkn);
    dbgLog.fine("step0=" + step0);

    // string into tokens
    String[] step1raw = step0.split(",");

    dbgLog.fine("step1raw=" + StringUtils.join(step1raw, ","));

    // remove meaningless commas, if any exist

    List<String> step1 = new ArrayList<String>();

    for (String el : step1raw) {
        if (!el.equals("")) {
            step1.add(el);
        }
    }

    dbgLog.fine("step1=" + StringUtils.join(step1, ","));

    List<List<String>> rangeData = new ArrayList<List<String>>();

    // for each token, check the range operator

    for (int i = 0; i < step1.size(); i++) {
        LinkedList<String> tmp = new LinkedList<String>(
                Arrays.asList(String2StringArray(String.valueOf(step1.get(i)))));

        Map<String, String> token = new HashMap<String, String>();
        boolean rangeMode = false;

        // .get(i) below CAN'T possibly be right (??) -- replacing
        // it with .get(0). -- L.A., v3.6
        //if ((!tmp.get(i).equals("[")) && (!tmp.get(i).equals("("))){
        if ((!tmp.get(0).equals("[")) && (!tmp.get(0).equals("("))) {
            // no LHS range operator
            // assume [
            token.put("start", "3");
        } else if (tmp.get(0).equals("[")) {
            rangeMode = true;
            token.put("start", "3");
            tmp.removeFirst();
        } else if (tmp.get(0).equals("(")) {
            rangeMode = true;
            token.put("start", "5");
            tmp.removeFirst();
        }

        if ((!tmp.getLast().equals("]")) && (!tmp.getLast().equals(")"))) {
            // no RHS range operator
            // assume ]
            token.put("end", "4");
        } else if (tmp.getLast().equals("]")) {
            rangeMode = true;
            tmp.removeLast();
            token.put("end", "4");
        } else if (tmp.getLast().equals(")")) {
            rangeMode = true;
            tmp.removeLast();
            token.put("end", "6");
        }

        // I'm now enforcing the following rules:
        // the "rangeMode" above - a range must have at least one range
        // operator, a square bracket or parenthesis, on one end, at
        // least; i.e., either on the left, or on the right. 
        // If there are no range operators, even if there are dashes
        // inside the token, they are not going to be interpreted as 
        // range definitions.  
        // still TODO: (possibly?) add more validation; figure out how 
        // to encode *date* ranges ("-" is not optimal, since dates already
        // contain dashes... although, since dates are (supposed to be) 
        // normalized it should still be possible to parse it unambiguously)
        //          -- L.A., v3.6

        if (rangeMode) {
            // after these steps, the string does not have range operators;
            // i.e., '-9--3', '--9', '-9-','-9', '-1-1', '1', '3-4', '6-'

            if ((tmp.get(0).equals("!")) && (tmp.get(1).equals("="))) {
                // != negation string is found
                token.put("start", "2");
                token.put("end", "");
                token.put("v1", StringUtils.join(tmp.subList(2, tmp.size()), ""));
                token.put("v2", "");
                dbgLog.fine("value=" + StringUtils.join(tmp.subList(2, tmp.size()), ","));

            } else if ((tmp.get(0).equals("-")) && (tmp.get(1).equals("-"))) {
                // type 2: --9
                token.put("v1", "");
                tmp.removeFirst();
                token.put("v2", StringUtils.join(tmp, ""));
            } else if ((tmp.get(0).equals("-")) && (tmp.getLast().equals("-"))) {
                // type 3: -9-
                token.put("v2", "");
                tmp.removeLast();
                token.put("v1", StringUtils.join(tmp, ""));
            } else if ((!tmp.get(0).equals("-")) && (tmp.getLast().equals("-"))) {
                // type 8: 6-
                token.put("v2", "");
                tmp.removeLast();
                token.put("v1", StringUtils.join(tmp, ""));
            } else {
                int count = 0;
                List<Integer> index = new ArrayList<Integer>();
                for (int j = 0; j < tmp.size(); j++) {
                    if (tmp.get(j).equals("-")) {
                        count++;
                        index.add(j);
                    }
                }

                if (count >= 2) {
                    // range type
                    // divide the second hyphen
                    // types 1 and 5: -9--3, -1-1
                    // token.put("v1", StringUtils.join(tmp[0..($index[1]-1)],"" ));
                    token.put("v2", StringUtils.join(tmp.subList((index.get(1) + 1), tmp.size()), ""));

                } else if (count == 1) {
                    if (tmp.get(0).equals("-")) {
                        // point negative type
                        // type 4: -9 or -inf,9
                        // do nothing
                        if ((token.get("start").equals("5"))
                                && ((token.get("end").equals("6")) || (token.get("end").equals("4")))) {
                            token.put("v1", "");
                            tmp.removeFirst();
                            token.put("v2", StringUtils.join(tmp, ""));
                        } else {
                            token.put("v1", StringUtils.join(tmp, ""));
                            token.put("v2", StringUtils.join(tmp, ""));
                        }
                    } else {
                        // type 7: 3-4
                        // both positive value and range type
                        String[] vset = (StringUtils.join(tmp, "")).split("-");
                        token.put("v1", vset[0]);
                        token.put("v2", vset[1]);
                    }

                } else {
                    // type 6: 1
                    token.put("v1", StringUtils.join(tmp, ""));
                    token.put("v2", StringUtils.join(tmp, ""));
                }
            }
        } else {
            // assume that this is NOT a range; treat the entire sequence 
            // of symbols as a single token:
            // type 6: 1
            token.put("v1", StringUtils.join(tmp, ""));
            token.put("v2", StringUtils.join(tmp, ""));
        }

        dbgLog.fine(i + "-th result=" + token.get("start") + "|" + token.get("v1") + "|" + token.get("end")
                + "|" + token.get("v2"));

        List<String> rangeSet = new ArrayList<String>();
        rangeSet.add(token.get("start"));
        rangeSet.add(token.get("v1"));
        rangeSet.add(token.get("end"));
        rangeSet.add(token.get("v2"));
        rangeData.add(rangeSet);

    }

    dbgLog.fine("rangeData:\n" + rangeData);
    return rangeData;
}
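
getValueRange() treats each token as a LinkedList of single-character strings so range operators can be peeled off both ends with removeFirst()/removeLast(), with size() shrinking accordingly. A minimal sketch (the token is illustrative):

import java.util.Arrays;
import java.util.LinkedList;

public class BracketPeelSketch {
    public static void main(String[] args) {
        LinkedList<String> tmp = new LinkedList<String>(
                Arrays.asList("[", "3", "-", "4", ")"));

        if (tmp.get(0).equals("[") || tmp.get(0).equals("(")) {
            tmp.removeFirst(); // drop the LHS range operator
        }
        if (tmp.getLast().equals("]") || tmp.getLast().equals(")")) {
            tmp.removeLast(); // drop the RHS range operator
        }
        System.out.println(tmp + " size=" + tmp.size()); // [3, -, 4] size=3
    }
}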

From source file: hr.fer.spocc.regex.AbstractRegularExpression.java

protected RegularExpression<T> createParseTree(List<RegularExpressionElement> elements) {

    //      System.out.println(">>> Parsing regexp: "+elements);

    /**
     * Stack which contains the parts of the regular expression that have
     * not yet been consumed by an operator. In addition, <code>null</code>
     * values can be pushed onto this stack to indicate that the symbols
     * to the right are grouped by parentheses.
     */
    LinkedList<RegularExpression<T>> symbolStack = new LinkedList<RegularExpression<T>>();

    /**
     * Operator stack
     */
    LinkedList<RegularExpressionOperator> opStack = new LinkedList<RegularExpressionOperator>();

    boolean sentinelParentheses = false;

    //      if (this.elements.get(0).getElementType() 
    //            != RegularExpressionElementType.LEFT_PARENTHESIS
    //         || this.elements.get(elements.size()-1).getElementType()
    //         != RegularExpressionElementType.RIGHT_PARENTHESIS) {
    sentinelParentheses = true;
    symbolStack.push(null);
    opStack.push(null);
    //      }

    int ind = -1;

    Iterator<RegularExpressionElement> iter = elements.iterator();
    while (iter.hasNext() || sentinelParentheses) {
        ++ind;
        RegularExpressionElement e;
        if (iter.hasNext()) {
            e = iter.next();
        } else { // ensure an extra iteration for the artificial trailing )
            e = RegularExpressionElements.RIGHT_PARENTHESIS;
            sentinelParentheses = false;
        }

        switch (e.getElementType()) {
        case SYMBOL:
            symbolStack.push(createTrivial(elements.subList(ind, ind + 1)));
            break;
        default:
            RegularExpressionOperator curOp = (e.getElementType() == RegularExpressionElementType.OPERATOR
                    ? (RegularExpressionOperator) e
                    : null);

            int priority = (curOp != null ? curOp.getPriority() : -1);

            if (e.getElementType() != RegularExpressionElementType.LEFT_PARENTHESIS) {

                //               System.out.println("Pre-while symbolStack: "+symbolStack);

                while (!opStack.isEmpty() && opStack.getFirst() != null
                        && opStack.getFirst().getPriority() >= priority && symbolStack.getFirst() != null) {

                    RegularExpressionOperator op = opStack.pop();
                    int arity = op.getArity();
                    int elementCount = 0;

                    //                  System.out.println("POP: "+op);

                    @SuppressWarnings("unchecked")
                    RegularExpression<T>[] operands = new RegularExpression[arity];
                    for (int i = arity - 1; i >= 0; --i) {
                        if (symbolStack.isEmpty()) {
                            throw new IllegalArgumentException("Missing ( after");
                        } else if (symbolStack.getFirst() == null) {
                            throw new IllegalArgumentException("Missing operand #" + (arity - i)
                                    + " for the operator " + op + " before index " + ind);
                        }
                        operands[i] = symbolStack.pop();
                        elementCount += operands[i].size();
                    }

                    RegularExpression<T> regex = createComposite(elements.subList(ind - elementCount - 1, ind),
                            op, operands);

                    //                  System.err.println(regex);
                    //                  System.err.println(regex.getSubexpression(0));

                    symbolStack.push(regex);

                    //                  System.out.println("Group: "+
                    //                        ArrayToStringUtils.toString(operands, "\n"));
                    //                  System.out.println("End group");
                    //                  System.out.println("Evaluated [" + (ind-elementCount-1)
                    //                        + ", " + ind + "): "+regex);
                    //                  System.out.println("Symbol stack: "+symbolStack);
                    //                  System.out.println("Op stack: "+opStack);
                    //                  System.out.println("---");

                }
            }

            if (curOp != null) {
                opStack.push(curOp);
            } else {
                switch (e.getElementType()) {
                case LEFT_PARENTHESIS:
                    symbolStack.push(null);
                    opStack.push(null);
                    break;
                default: // i.e. the element is )
                    Validate.isTrue(symbolStack.size() >= 2,
                            "Exactly one expression is expected " + "inside parentheses before index " + ind);

                    // pop left bracket (null) from the operator stack
                    Object nullValue = opStack.pop();
                    Validate.isTrue(nullValue == null);

                    // pop left bracket (null) from the symbol stack
                    RegularExpression<T> regex = symbolStack.pop();
                    nullValue = symbolStack.pop();

                    // check if left bracket was removed indeed
                    //                  Validate.isTrue(nullValue == null, 
                    //                        "Expected ( at index " + (ind-regex.size()-1));

                    // expand the expression if parentheses are not sentinel
                    if (sentinelParentheses) { // XXX a different flag would be better
                        //                     System.out.print("Expand [" 
                        //                           + (ind - regex.size() - 1) + ", "
                        //                           + (ind + 1) + "]: ");
                        //                     System.out.println("[regex size = "+regex.size()
                        //                           + "]");
                        regex = createExpanded(regex, elements.subList(ind - regex.size() - 1, ind + 1));

                        //                     System.out.println(" -> "+regex);
                    }

                    // and put back the expression inside parentheses
                    symbolStack.push(regex);
                }
            }

        } // end of switch

        //         System.out.println("----- " + ind + " ----");
        //         System.out.println("Symbol stack: "+symbolStack);
        //         System.out.println("Op stack: "+opStack);
    }

    //Validate.isTrue(symbolStack.size() == 1);
    //Validate.isTrue(opStack.isEmpty());

    return symbolStack.pop();
}
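
createParseTree() above uses two LinkedLists as stacks (push/pop/getFirst), with null pushed as a sentinel for an open parenthesis, and validates with symbolStack.size() >= 2 that a group holds at least the sentinel plus one expression. A minimal sketch of LinkedList as a stack:

import java.util.LinkedList;

public class SentinelStackSketch {
    public static void main(String[] args) {
        LinkedList<String> symbolStack = new LinkedList<String>();
        symbolStack.push(null); // sentinel marking an open parenthesis
        symbolStack.push("a");

        // the sentinel plus at least one expression must be present
        System.out.println(symbolStack.size() >= 2); // true

        System.out.println(symbolStack.pop()); // a
        System.out.println(symbolStack.pop()); // null (the sentinel)
    }
}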

From source file: com.edgenius.wiki.render.filter.MacroFilter.java

private void checkGroup(final int initPos, CharSequence input, final LinkedList<GroupProcessor> stack,
        List<GroupProcessor> processors) {
    final List<Region> pairRegions = new ArrayList<Region>();

    singleMacroProvider.replaceByTokenVisitor(input, new TokenVisitor<Matcher>() {
        public void handleMatch(StringBuffer buffer, Matcher result) {
            String macroName = result.group(1);
            if (macroName != null && !macroName.startsWith("$")) {
                Macro macro = macroMgr.getMacro(macroName);

                if (macro != null) {
                    //IMPORTANT: Macro.isPair() is deliberately not checked here; the macro is put into
                    //pairRegions regardless, because the processing order must stay consistent with the
                    //physical order in the input text; for example, in {table}{cell}...{rowdiv}, rowdiv
                    //is single and must come after cell
                    int start = result.start(0);
                    int end = result.end(0);
                    Region pair = new Region(start, end);
                    //no parameters: mark as unknown; otherwise it must be a start macro
                    if (StringUtils.isBlank(result.group(2))) {
                        pair.setKey(MACRO_REGION_KEY_UNKNOWN);
                    } else {
                        pair.setKey(MACRO_REGION_KEY_START);
                    }

                    //temporarily remember the macro name
                    pair.setContent(macroName);
                    //add to the list
                    pairRegions.add(pair);
                }
            }
        }
    });

    int size = pairRegions.size();
    if (size > 0) {
        StringBuffer inputBuf = new StringBuffer(input);
        for (int idx = 0; idx < size; idx++) {
            Region reg = pairRegions.get(idx);
            Macro macro = macroMgr.getMacro(reg.getContent());
            if (macro.isPaired()) {
                int deep = 0;
                Region pair = null;
                //looking for pairs...
                for (int chIdx = idx + 1; chIdx < size; chIdx++) {
                    Region next = pairRegions.get(chIdx);
                    if (StringUtils.equalsIgnoreCase(reg.getContent(), next.getContent())) {
                        //an unknown start (no attributes) must pair with an unknown end
                        if (MACRO_REGION_KEY_UNKNOWN.equals(reg.getKey())
                                && MACRO_REGION_KEY_UNKNOWN.equals(next.getKey())) {
                            //matched
                            pair = next;
                            //skip all internal nodes - they are handled by the embedded recursion
                            idx = chIdx;
                            break;
                        }

                        if (MACRO_REGION_KEY_START.equals(reg.getKey())
                                && MACRO_REGION_KEY_UNKNOWN.equals(next.getKey())) {
                            if (deep == 0) {
                                //matched;
                                pair = next;
                                //skip all internal nodes - they are handled by the embedded recursion
                                idx = chIdx;
                                break;
                            } else {
                                //another inner macro with the same name matched; decrease depth by 1
                                deep--;
                            }
                        }
                        if (MACRO_REGION_KEY_START.equals(next.getKey())) {
                            //another start macro found (the 4th scenario) - increase depth
                            deep++;
                        }
                    }
                }
                //a matching pair was found
                if (pair != null) {
                    int start = initPos + reg.getStart();
                    int end = initPos + pair.getEnd();
                    int contentStart = initPos + reg.getEnd();
                    int contentEnd = initPos + pair.getStart();

                    GroupProcessor currProcessor = stack.size() == 0 ? null : stack.getLast();
                    if (currProcessor != null) {
                        currProcessor.adoptChild(macro, start, end);
                    }

                    if (macro.isProcessEmbedded() && (end > start)) {
                        if (macro.hasChildren() != null) {
                            stack.add(((GroupProcessorMacro) macro).newGroupProcessor(macro, start, end));
                        }
                        checkGroup(contentStart,
                                inputBuf.subSequence(contentStart - initPos, contentEnd - initPos), stack,
                                processors);
                        if (macro.hasChildren() != null) {
                            //pop the current one, means it is a completed GroupProcessor
                            processors.add(stack.removeLast());
                        }
                    }
                }
            } else {
                //single macro - directly check whether it is a child
                GroupProcessor currProcessor = stack.size() == 0 ? null : stack.getLast();
                if (currProcessor != null) {
                    currProcessor.adoptChild(macro, initPos + reg.getStart(), initPos + reg.getEnd());
                }
            }
        }
    }
}
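
checkGroup() above peeks at the innermost processor with stack.size() == 0 ? null : stack.getLast(); on a LinkedList, size() == 0 is equivalent to isEmpty(), and both are O(1). A minimal sketch of that guarded peek (names are illustrative):

import java.util.LinkedList;

public class GuardedPeekSketch {
    public static void main(String[] args) {
        LinkedList<String> stack = new LinkedList<String>();

        String current = stack.size() == 0 ? null : stack.getLast();
        System.out.println(current); // null: no enclosing processor yet

        stack.add("tableProcessor");
        current = stack.size() == 0 ? null : stack.getLast();
        System.out.println(current); // tableProcessor
    }
}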

From source file: elh.eus.absa.Features.java

/**
 * Fills the attribute vectors for the instances in the given corpus.
 * Attribute vectors contain the features loaded by the creatFeatureSet() function.
 *
 * @param save whether the Instances file should be saved to an arff file or not.
 * @param prefix string appended to the name of the saved arff file.
 * @return Weka Instances object containing the attribute vectors filled with the features
 *          specified in the parameter file.
 */
public Instances loadInstances(boolean save, String prefix) throws IOException {
    String savePath = params.getProperty("fVectorDir") + File.separator + "arff" + File.separator + "train_"
            + prefix;
    HashMap<String, Opinion> trainExamples = corpus.getOpinions();

    int trainExamplesNum = trainExamples.size();

    int bowWin = 0;
    if (params.containsKey("window")) {
        bowWin = Integer.parseInt(params.getProperty("window"));
        savePath = savePath + "_w" + bowWin;
    }

    //Properties posProp = new Properties();
    //eus.ixa.ixa.pipe.pos.Annotate postagger = new eus.ixa.ixa.pipe.pos.Annotate(posProp);      
    if (params.containsKey("lemmaNgrams")) {
        Properties posProp = NLPpipelineWrapper.setPostaggerProperties(params.getProperty("pos-model"),
                corpus.getLang(), "3", "bin", "false");

        postagger = new eus.ixa.ixa.pipe.pos.Annotate(posProp);
    }

    //System.out.println("train examples: "+trainExamplesNum);
    //Create the Weka object for the training set
    Instances rsltdata = new Instances("train", atts, trainExamplesNum);

    // setting class attribute (last attribute in train data)
    //traindata.setClassIndex(traindata.numAttributes() - 1);

    System.err.println("Features: loadInstances() - featNum: " + this.featNum + " - trainset attrib num -> "
            + rsltdata.numAttributes() + " - ");
    System.out.println("Features: loadInstances() - featNum: " + this.featNum + " - trainset attrib num -> "
            + rsltdata.numAttributes() + " - ");

    int instId = 1;
    // fill the vectors for each training example
    for (String oId : trainExamples.keySet()) {
        //System.err.println("sentence: "+ corpus.getOpinionSentence(o.getId()));

        //value vector
        double[] values = new double[featNum];

        // first element is the instanceId         
        values[rsltdata.attribute("instanceId").index()] = instId;

        // opinion sentence (normalized below if requested)
        String opNormalized = corpus.getOpinionSentence(oId);

        // compute uppercase ratio before normalization (if needed)      
        double upRatio = 0.0;
        if (params.getProperty("upperCaseRatio", "no").equalsIgnoreCase("yes")) {
            String upper = opNormalized.replaceAll("[\\p{Ll}]", "");
            upRatio = (double) upper.length() / (double) opNormalized.length();
            values[rsltdata.attribute("upperCaseRation").index()] = upRatio;
        }

        // string normalization (emoticons, twitter grammar,...)
        if ((params.containsKey("wfngrams") || params.containsKey("lemmaNgrams"))
                && (!params.getProperty("normalization", "none").equalsIgnoreCase("noEmot"))) {
            opNormalized = normalize(opNormalized, params.getProperty("normalization", "none"));
        }

        //process the current instance with the NLP pipeline in order to get token and lemma|pos features
        KAFDocument nafinst = new KAFDocument("", "");
        String nafname = trainExamples.get(oId).getsId().replace(':', '_');
        String nafDir = params.getProperty("kafDir");
        String nafPath = nafDir + File.separator + nafname + ".kaf";
        //counter for opinion sentence token number. Used for computing relative values of the features
        int tokNum = 1;
        try {
            if (params.containsKey("lemmaNgrams")) //(lemmaNgrams != null) && (!lemmaNgrams.isEmpty()))
            {
                if (FileUtilsElh.checkFile(nafPath)) {
                    nafinst = KAFDocument.createFromFile(new File(nafPath));
                } else {
                    nafinst = NLPpipelineWrapper.ixaPipesTokPos(opNormalized, corpus.getLang(),
                            params.getProperty("pos-model"), postagger);
                    Files.createDirectories(Paths.get(nafDir));
                    nafinst.save(nafPath);
                }
                tokNum = nafinst.getWFs().size();
                //System.err.println("Features::loadInstances - postagging opinion sentence ("+oId+") - "+corpus.getOpinionSentence(oId));
            } else {
                if (FileUtilsElh.checkFile(nafPath)) {
                    nafinst = KAFDocument.createFromFile(new File(nafPath));
                } else {
                    nafinst = NLPpipelineWrapper.ixaPipesTok(opNormalized, corpus.getLang());
                }
                tokNum = nafinst.getWFs().size();
                //System.err.println("Features::loadInstances - tokenizing opinion sentence ("+oId+") - "+corpus.getOpinionSentence(oId));

            }
        } catch (IOException | JDOMException e) {
            System.err.println("Features::loadInstances() - error when NLP processing the instance " + instId
                    + "|" + oId + ") for filling the attribute vector");
            e.printStackTrace();
            System.exit(5);
        }

        LinkedList<String> ngrams = new LinkedList<String>();
        int ngramDim;
        try {
            ngramDim = Integer.valueOf(params.getProperty("wfngrams"));
        } catch (Exception e) {
            ngramDim = 0;
        }

        boolean polNgrams = false;
        if (params.containsKey("polNgrams")) {
            polNgrams = params.getProperty("polNgrams").equalsIgnoreCase("yes");
        }

        List<WF> window = nafinst.getWFs();
        Integer end = corpus.getOpinion(oId).getTo();
        // apply the window if it is active (>0) and the target is not null (to=0)
        if ((bowWin > 0) && (end > 0)) {
            Integer start = corpus.getOpinion(oId).getFrom();
            Integer to = window.size();
            Integer from = 0;
            end++;
            for (int i = 0; i < window.size(); i++) {
                WF wf = window.get(i);
                if ((wf.getOffset() == start) && (i >= bowWin)) {
                    from = i - bowWin;
                } else if (wf.getOffset() >= end) {
                    if (i + bowWin < window.size()) {
                        to = i + bowWin;
                    }
                    break;
                }
            }
            window = window.subList(from, to);
            //System.out.println("startTgt: "+start+" - from: "+from+" | endTrgt:"+(end-1)+" - to:"+to);
        }

        //System.out.println("Sentence: "+corpus.getOpinionSentence(oId)+" - target: "+corpus.getOpinion(oId).getTarget()+
        //      "\n window: from-> "+window.get(0).getForm()+" to-> "+window.get(window.size()-1)+" .\n");

        List<String> windowWFIds = new ArrayList<String>();

        // word form ngram related features
        for (WF wf : window) {
            windowWFIds.add(wf.getId());

            String wfStr = wf.getForm();
            if (params.containsKey("wfngrams") && ngramDim > 0) {
                if (!savePath.contains("_wf" + ngramDim)) {
                    savePath = savePath + "_wf" + ngramDim;
                }
                //if the current word form is in the ngram list activate the feature in the vector
                if (ngrams.size() >= ngramDim) {
                    ngrams.removeFirst();
                }
                ngrams.add(wfStr);

                // add ngrams to the feature vector
                checkNgramFeatures(ngrams, values, "wf", 1, false); //toknum

            }
            // Clark cluster info corresponding to the current word form
            if (params.containsKey("clark") && attributeSets.get("ClarkCl").containsKey(wfStr)) {
                if (!savePath.contains("_cl")) {
                    savePath = savePath + "_cl";
                }
                values[rsltdata.attribute("ClarkClId_" + attributeSets.get("ClarkCl").get(wfStr)).index()]++;
            }

            // Brown cluster info corresponding to the current word form
            if (params.containsKey("brown") && attributeSets.get("BrownCl").containsKey(wfStr)) {
                if (!savePath.contains("_br")) {
                    savePath = savePath + "_br";
                }
                values[rsltdata.attribute("BrownClId_" + attributeSets.get("BrownCl").get(wfStr)).index()]++;
            }

            // word2vec cluster info corresponding to the current word form
            if (params.containsKey("word2vec") && attributeSets.get("w2vCl").containsKey(wfStr)) {
                if (!savePath.contains("_w2v")) {
                    savePath = savePath + "_w2v";
                }
                values[rsltdata.attribute("w2vClId_" + attributeSets.get("w2vCl").get(wfStr)).index()]++;
            }

        }

        //empty ngram list and add remaining ngrams to the feature list
        checkNgramFeatures(ngrams, values, "wf", 1, true); //toknum

        // PoS tagger related attributes: lemmas and pos tags
        if (params.containsKey("lemmaNgrams")
                || (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0"))
                || params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) {
            ngrams = new LinkedList<String>();
            if (params.containsKey("lemmaNgrams")
                    && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) {
                ngramDim = Integer.valueOf(params.getProperty("lemmaNgrams"));
            } else {
                ngramDim = 3;
            }
            LinkedList<String> posNgrams = new LinkedList<String>();
            int posNgramDim = 0;
            if (params.containsKey("pos")) {
                posNgramDim = Integer.valueOf(params.getProperty("pos"));
            }

            for (Term t : nafinst.getTermsFromWFs(windowWFIds)) {
                //lemmas // && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))
                if ((params.containsKey("lemmaNgrams")) || params.containsKey("polarLexiconGeneral")
                        || params.containsKey("polarLexiconDomain")) {
                    if (!savePath.contains("_l" + ngramDim)) {
                        savePath = savePath + "_l" + ngramDim;
                    }

                    String lemma = t.getLemma();

                    if (ngrams.size() >= ngramDim) {
                        ngrams.removeFirst();
                    }
                    ngrams.add(lemma);

                    // add ngrams to the feature vector
                    for (int i = 0; i < ngrams.size(); i++) {
                        String ng = featureFromArray(ngrams.subList(0, i + 1), "lemma");
                        //if the current lemma is in the ngram list activate the feature in the vector
                        if (params.containsKey("lemmaNgrams")
                                && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) {
                            Attribute ngAtt = rsltdata.attribute(ng);
                            if (ngAtt != null) {
                                addNumericToFeatureVector(ng, values, 1); //tokNum                     
                            }
                        }

                        ng = featureFromArray(ngrams.subList(0, i + 1), "");
                        if (params.containsKey("polarLexiconGeneral")
                                || params.containsKey("polarLexiconDomain")) {
                            checkPolarityLexicons(ng, values, tokNum, polNgrams);
                        } //end polarity ngram checker
                    } //end ngram checking                                      
                }
                //pos tags
                if (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0")) {
                    if (!savePath.contains("_p")) {
                        savePath = savePath + "_p";
                    }

                    if (posNgrams.size() >= posNgramDim) {
                        posNgrams.removeFirst();
                    }
                    posNgrams.add(t.getPos());

                    // add ngrams to the feature vector
                    checkNgramFeatures(posNgrams, values, "pos", 1, false);
                }
            } //endFor

            //empty ngram list and add remaining ngrams to the feature list
            while (!ngrams.isEmpty()) {
                String ng = featureFromArray(ngrams, "lemma");

                //if the current lemma is in the ngram list activate the feature in the vector
                if (rsltdata.attribute(ng) != null) {
                    addNumericToFeatureVector(ng, values, 1); //tokNum
                }

                // polarity lexicons
                if (params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) {
                    checkPolarityLexicons(ng, values, tokNum, polNgrams);
                } //end polarity ngram checker

                ngrams.removeFirst();
            }

            //empty pos ngram list and add remaining pos ngrams to the feature list
            checkNgramFeatures(posNgrams, values, "pos", 1, true);

        }

        // add sentence length as a feature
        if (params.containsKey("sentenceLength")
                && (!params.getProperty("sentenceLength").equalsIgnoreCase("no"))) {
            values[rsltdata.attribute("sentenceLength").index()] = tokNum;
        }

        //create object for the current instance and associate it with the current train dataset.         
        Instance inst = new SparseInstance(1.0, values);
        inst.setDataset(rsltdata);

        // add category attribute values
        String cat = trainExamples.get(oId).getCategory();

        if (params.containsKey("categories") && params.getProperty("categories").compareTo("E&A") == 0) {
            if (cat.compareTo("NULL") == 0) {
                inst.setValue(rsltdata.attribute("entCat").index(), cat);
                inst.setValue(rsltdata.attribute("attCat").index(), cat);
            } else {
                String[] splitCat = cat.split("#");
                inst.setValue(rsltdata.attribute("entCat").index(), splitCat[0]);
                inst.setValue(rsltdata.attribute("attCat").index(), splitCat[1]);
            }

            //inst.setValue(attIndexes.get("entAttCat"), cat);
        } else if (params.containsKey("categories") && params.getProperty("categories").compareTo("E#A") == 0) {
            inst.setValue(rsltdata.attribute("entAttCat").index(), cat);
        }

        if (params.containsKey("polarity") && params.getProperty("polarity").compareTo("yes") == 0) {
            // add class value as a double (Weka stores all values as doubles)
            String pol = normalizePolarity(trainExamples.get(oId).getPolarity());
            //System.err.println("Features::loadInstances - pol "+pol+" for oid "+oId+" - text:"+corpus.getOpinionSentence(oId));
            if (pol != null && !pol.isEmpty()) {
                //System.err.println("polarity: _"+pol+"_");
                inst.setValue(rsltdata.attribute("polarityCat"), pol);
            } else {
                inst.setMissing(rsltdata.attribute("polarityCat"));
            }
        }

        //add instance to train data
        rsltdata.add(inst);

        //store opinion Id and instance Id
        this.opInst.put(oId, instId);
        instId++;
    }

    System.err.println("Features : loadInstances() - training data ready total number of examples -> "
            + trainExamplesNum + " - " + rsltdata.numInstances());

    if (save) {
        try {
            savePath = savePath + ".arff";
            System.err.println("arff written to: " + savePath);
            ArffSaver saver = new ArffSaver();

            saver.setInstances(rsltdata);

            saver.setFile(new File(savePath));
            saver.writeBatch();
        } catch (IOException e1) {
            e1.printStackTrace();
        } catch (Exception e2) {
            e2.printStackTrace();
        }
    }
    return rsltdata;
}
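
The loadInstances() method above leans on a single LinkedList idiom throughout: the list acts as a bounded sliding window, and size() is compared against the configured n-gram dimension so that removeFirst() can evict the oldest token before a new one is added. The sketch below isolates that idiom; SlidingNgramWindow and collectNgrams() are illustrative names, not part of the original source.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;

public class SlidingNgramWindow {

    // Collects all n-grams of up to maxDim tokens from a token stream,
    // using LinkedList.size() to cap the window before evicting the
    // oldest token, as the feature-extraction loop above does.
    public static List<String> collectNgrams(List<String> tokens, int maxDim) {
        LinkedList<String> window = new LinkedList<String>();
        List<String> ngrams = new ArrayList<String>();
        for (String tok : tokens) {
            // drop the oldest token once the window is full
            if (window.size() >= maxDim) {
                window.removeFirst();
            }
            window.add(tok);
            // emit every prefix of the window as an n-gram, mirroring
            // the subList(0, i + 1) calls in loadInstances()
            for (int i = 0; i < window.size(); i++) {
                ngrams.add(String.join(" ", window.subList(0, i + 1)));
            }
        }
        return ngrams;
    }

    public static void main(String[] args) {
        // prints [the, the, the quick, quick, quick brown, brown, brown fox]
        System.out.println(collectNgrams(Arrays.asList("the", "quick", "brown", "fox"), 2));
    }
}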

From source file:com.joliciel.talismane.TalismaneImpl.java

public void analyse(TalismaneConfig config) {
    try {
        if (config.needsSentenceDetector()) {
            if (config.getSentenceDetector() == null) {
                throw new TalismaneException("Sentence detector not provided.");
            }
        }
        if (config.needsTokeniser()) {
            if (config.getTokeniser() == null) {
                throw new TalismaneException("Tokeniser not provided.");
            }
        }
        if (config.needsPosTagger()) {
            if (config.getPosTagger() == null) {
                throw new TalismaneException("Pos-tagger not provided.");
            }
        }
        if (config.needsParser()) {
            if (config.getParser() == null) {
                throw new TalismaneException("Parser not provided.");
            }
        }

        if (config.getEndModule().equals(Module.SentenceDetector)) {
            if (this.getSentenceProcessor() == null) {
                throw new TalismaneException(
                        "No sentence processor provided with sentence detector end module, cannot generate output.");
            }
        }
        if (config.getEndModule().equals(Module.Tokeniser)) {
            if (this.getTokenSequenceProcessor() == null) {
                throw new TalismaneException(
                        "No token sequence processor provided with tokeniser end module, cannot generate output.");
            }
        }
        if (config.getEndModule().equals(Module.PosTagger)) {
            if (this.getPosTagSequenceProcessor() == null) {
                throw new TalismaneException(
                        "No postag sequence processor provided with pos-tagger end module, cannot generate output.");
            }
        }
        if (config.getEndModule().equals(Module.Parser)) {
            if (this.getParseConfigurationProcessor() == null) {
                throw new TalismaneException(
                        "No parse configuration processor provided with parser end module, cannot generate output.");
            }
        }

        LinkedList<String> textSegments = new LinkedList<String>();
        LinkedList<Sentence> sentences = new LinkedList<Sentence>();
        TokenSequence tokenSequence = null;
        PosTagSequence posTagSequence = null;

        RollingSentenceProcessor rollingSentenceProcessor = this.getFilterService()
                .getRollingSentenceProcessor(config.getFileName(), config.isProcessByDefault());
        Sentence leftover = null;
        if (config.getStartModule().equals(Module.SentenceDetector)
                || config.getStartModule().equals(Module.Tokeniser)) {
            // prime the sentence detector with two text segments, to ensure everything gets processed
            textSegments.addLast("");
            textSegments.addLast("");
        }

        StringBuilder stringBuilder = new StringBuilder();
        boolean finished = false;
        int sentenceCount = 0;

        String prevProcessedText = "";
        String processedText = "";
        String nextProcessedText = "";
        SentenceHolder prevSentenceHolder = null;

        int endBlockCharacterCount = 0;

        while (!finished) {
            if (config.getStartModule().equals(Module.SentenceDetector)
                    || config.getStartModule().equals(Module.Tokeniser)) {
                // Note SentenceDetector and Tokeniser start modules treated identically,
                // except that for SentenceDetector we apply a probabilistic sentence detector
                // whereas for Tokeniser we assume all sentence breaks are marked by filters

                // read characters from the reader, one at a time
                char c;
                int r = -1;
                try {
                    r = this.getReader().read();
                } catch (IOException e) {
                    LogUtils.logError(LOG, e);
                }

                if (r == -1) {
                    finished = true;
                    c = '\n';
                } else {
                    c = (char) r;
                }

                // Jump out if we have 3 consecutive end-block characters.
                if (c == config.getEndBlockCharacter()) {
                    endBlockCharacterCount++;
                    if (endBlockCharacterCount == 3) {
                        LOG.info("Three consecutive end-block characters. Exiting.");
                        finished = true;
                    }
                } else {
                    endBlockCharacterCount = 0;
                }

                // flush the accumulated text as a segment when input ends, the block
                // size is exceeded at whitespace, or an end-block character is seen
                if (finished || (Character.isWhitespace(c) && stringBuilder.length() > config.getBlockSize())
                        || c == config.getEndBlockCharacter()) {
                    if (c == config.getEndBlockCharacter())
                        stringBuilder.append(c);
                    if (stringBuilder.length() > 0) {
                        String textSegment = stringBuilder.toString();
                        stringBuilder = new StringBuilder();

                        textSegments.add(textSegment);
                    } // is the current block > 0 characters?
                    if (c == config.getEndBlockCharacter()) {
                        textSegments.addLast("");
                    }
                } // is there a next block available?

                if (finished) {
                    if (stringBuilder.length() > 0) {
                        textSegments.addLast(stringBuilder.toString());
                        stringBuilder = new StringBuilder();
                    }
                    textSegments.addLast("");
                    textSegments.addLast("");
                    textSegments.addLast("");
                }

                if (c != config.getEndBlockCharacter())
                    stringBuilder.append(c);

                while (textSegments.size() >= 3) {
                    String prevText = textSegments.removeFirst();
                    String text = textSegments.removeFirst();
                    String nextText = textSegments.removeFirst();
                    if (LOG.isTraceEnabled()) {
                        LOG.trace("prevText: " + prevText);
                        LOG.trace("text: " + text);
                        LOG.trace("nextText: " + nextText);
                    }

                    Set<TextMarker> textMarkers = new TreeSet<TextMarker>();
                    for (TextMarkerFilter textMarkerFilter : config.getTextMarkerFilters()) {
                        Set<TextMarker> result = textMarkerFilter.apply(prevText, text, nextText);
                        textMarkers.addAll(result);
                    }

                    // push the text segments back onto the beginning of Deque
                    textSegments.addFirst(nextText);
                    textSegments.addFirst(text);

                    SentenceHolder sentenceHolder = rollingSentenceProcessor.addNextSegment(text, textMarkers);
                    prevProcessedText = processedText;
                    processedText = nextProcessedText;
                    nextProcessedText = sentenceHolder.getText();

                    if (LOG.isTraceEnabled()) {
                        LOG.trace("prevProcessedText: " + prevProcessedText);
                        LOG.trace("processedText: " + processedText);
                        LOG.trace("nextProcessedText: " + nextProcessedText);
                    }

                    boolean reallyFinished = finished && textSegments.size() == 3;

                    if (prevSentenceHolder != null) {
                        if (config.getStartModule().equals(Module.SentenceDetector)) {
                            List<Integer> sentenceBreaks = config.getSentenceDetector()
                                    .detectSentences(prevProcessedText, processedText, nextProcessedText);
                            for (int sentenceBreak : sentenceBreaks) {
                                prevSentenceHolder.addSentenceBoundary(sentenceBreak);
                            }
                        }

                        List<Sentence> theSentences = prevSentenceHolder.getDetectedSentences(leftover);
                        leftover = null;
                        for (Sentence sentence : theSentences) {
                            if (sentence.isComplete() || reallyFinished) {
                                sentences.add(sentence);
                                sentenceCount++;
                            } else {
                                LOG.debug("Setting leftover to: " + sentence.getText());
                                leftover = sentence;
                            }
                        }
                        if (config.getMaxSentenceCount() > 0 && sentenceCount >= config.getMaxSentenceCount()) {
                            finished = true;
                        }
                    }
                    prevSentenceHolder = sentenceHolder;
                } // we have at least 3 text segments (should always be the case once we get started)
            } else if (config.getStartModule().equals(Module.PosTagger)) {
                if (config.getTokenCorpusReader().hasNextTokenSequence()) {
                    tokenSequence = config.getTokenCorpusReader().nextTokenSequence();
                } else {
                    tokenSequence = null;
                    finished = true;
                }
            } else if (config.getStartModule().equals(Module.Parser)) {
                if (config.getPosTagCorpusReader().hasNextPosTagSequence()) {
                    posTagSequence = config.getPosTagCorpusReader().nextPosTagSequence();
                } else {
                    posTagSequence = null;
                    finished = true;
                }
            } // which start module?

            boolean needToProcess = false;
            if (config.getStartModule().equals(Module.SentenceDetector)
                    || config.getStartModule().equals(Module.Tokeniser))
                needToProcess = !sentences.isEmpty();
            else if (config.getStartModule().equals(Module.PosTagger))
                needToProcess = tokenSequence != null;
            else if (config.getStartModule().equals(Module.Parser))
                needToProcess = posTagSequence != null;

            while (needToProcess) {
                Sentence sentence = null;
                if (config.getStartModule().compareTo(Module.Tokeniser) <= 0
                        && config.getEndModule().compareTo(Module.SentenceDetector) >= 0) {
                    sentence = sentences.poll();
                    LOG.debug("Sentence: " + sentence);
                    if (this.getSentenceProcessor() != null)
                        this.getSentenceProcessor().onNextSentence(sentence.getText(), this.getWriter());
                } // need to read next sentence

                List<TokenSequence> tokenSequences = null;
                if (config.needsTokeniser()) {
                    tokenSequences = config.getTokeniser().tokenise(sentence);
                    tokenSequence = tokenSequences.get(0);

                    if (this.getTokenSequenceProcessor() != null) {
                        this.getTokenSequenceProcessor().onNextTokenSequence(tokenSequence, this.getWriter());
                    }
                } // need to tokenise ?

                List<PosTagSequence> posTagSequences = null;
                if (config.needsPosTagger()) {
                    posTagSequence = null;
                    if (tokenSequences == null || !config.isPropagateTokeniserBeam()) {
                        tokenSequences = new ArrayList<TokenSequence>();
                        tokenSequences.add(tokenSequence);
                    }

                    if (config.getPosTagger() instanceof NonDeterministicPosTagger) {
                        NonDeterministicPosTagger nonDeterministicPosTagger = (NonDeterministicPosTagger) config
                                .getPosTagger();
                        posTagSequences = nonDeterministicPosTagger.tagSentence(tokenSequences);
                        posTagSequence = posTagSequences.get(0);
                    } else {
                        posTagSequence = config.getPosTagger().tagSentence(tokenSequence);
                    }

                    if (posTagSequenceProcessor != null) {
                        posTagSequenceProcessor.onNextPosTagSequence(posTagSequence, this.getWriter());
                    }

                    tokenSequence = null;
                } // need to postag

                if (config.needsParser()) {
                    if (posTagSequences == null || !config.isPropagatePosTaggerBeam()) {
                        posTagSequences = new ArrayList<PosTagSequence>();
                        posTagSequences.add(posTagSequence);
                    }

                    ParseConfiguration parseConfiguration = null;
                    List<ParseConfiguration> parseConfigurations = null;
                    try {
                        if (config.getParser() instanceof NonDeterministicParser) {
                            NonDeterministicParser nonDeterministicParser = (NonDeterministicParser) config
                                    .getParser();
                            parseConfigurations = nonDeterministicParser.parseSentence(posTagSequences);
                            parseConfiguration = parseConfigurations.get(0);
                        } else {
                            parseConfiguration = config.getParser().parseSentence(posTagSequence);
                        }

                        if (this.getParseConfigurationProcessor() != null) {
                            this.getParseConfigurationProcessor().onNextParseConfiguration(parseConfiguration,
                                    this.getWriter());
                        }
                    } catch (Exception e) {
                        LOG.error(e);
                        if (stopOnError)
                            throw new RuntimeException(e);
                    }
                    posTagSequence = null;
                } // need to parse

                if (config.getStartModule().equals(Module.SentenceDetector)
                        || config.getStartModule().equals(Module.Tokeniser))
                    needToProcess = !sentences.isEmpty();
                else if (config.getStartModule().equals(Module.PosTagger))
                    needToProcess = tokenSequence != null;
                else if (config.getStartModule().equals(Module.Parser))
                    needToProcess = posTagSequence != null;
            } // next sentence
        } // next character
    } finally {
        if (this.getParseConfigurationProcessor() != null) {
            this.getParseConfigurationProcessor().onCompleteParse();
        }

        try {
            this.getReader().close();
            this.getWriter().flush();
            this.getWriter().close();
        } catch (IOException ioe2) {
            LOG.error(ioe2);
            throw new RuntimeException(ioe2);
        }

    }
}
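
The textSegments buffer in analyse() shows another LinkedList.size() pattern: the deque is primed with empty strings so that while (textSegments.size() >= 3) always yields a previous/current/next triple, and two of the three segments are pushed back so the window advances one segment per iteration. A condensed sketch of that rolling-triple technique, assuming a processTriple() placeholder in place of the real text-marker filtering:

import java.util.Arrays;
import java.util.LinkedList;

public class RollingSegments {

    // Feeds segments through a prev/current/next window. The buffer is
    // primed with two empty strings so size() >= 3 always yields a full
    // triple, and current/next are pushed back so the window advances
    // one segment per iteration, as in analyse() above.
    public static void run(Iterable<String> segments) {
        LinkedList<String> buffer = new LinkedList<String>();
        buffer.addLast("");
        buffer.addLast("");
        for (String segment : segments) {
            buffer.addLast(segment);
            while (buffer.size() >= 3) {
                String prev = buffer.removeFirst();
                String current = buffer.removeFirst();
                String next = buffer.removeFirst();
                processTriple(prev, current, next);
                // push current and next back onto the head of the deque
                buffer.addFirst(next);
                buffer.addFirst(current);
            }
        }
    }

    // placeholder for the real text-marker filtering
    private static void processTriple(String prev, String current, String next) {
        System.out.println("[" + prev + " | " + current + " | " + next + "]");
    }

    public static void main(String[] args) {
        run(Arrays.asList("one", "two", "three", ""));
    }
}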

From source file:elh.eus.absa.Features.java

/**
 *   Fills the attribute vectors for the instances in the given CoNLL tabulated format corpus.
 *   Attribute vectors contain the features loaded by the creatFeatureSet() function.
 *
 * @param save whether the resulting Instances object should be saved to an arff file or not.
 * @param prefix string used in the saved arff file name.
 * @return Weka Instances object containing the attribute vectors filled with the features specified
 *          in the parameter file.
 */
public Instances loadInstancesTAB(boolean save, String prefix) {
    String savePath = params.getProperty("fVectorDir") + File.separator + "arff" + File.separator + "train_"
            + prefix;
    HashMap<String, Opinion> trainExamples = corpus.getOpinions();

    int trainExamplesNum = trainExamples.size();

    int bowWin = 0;
    if (params.containsKey("window")) {
        bowWin = Integer.parseInt(params.getProperty("window"));
        savePath = savePath + "_w" + bowWin;
    }

    //System.out.println("train examples: "+trainExamplesNum);
    //Create the Weka object for the training set
    Instances rsltdata = new Instances("train", atts, trainExamplesNum);

    // setting class attribute (last attribute in train data)
    //traindata.setClassIndex(traindata.numAttributes() - 1);

    System.err.println("Features: loadInstancesTAB() - featNum: " + this.featNum + " - trainset attrib num -> "
            + rsltdata.numAttributes() + " - ");
    System.out.println("Features: loadInstancesTAB() - featNum: " + this.featNum + " - trainset attrib num -> "
            + rsltdata.numAttributes() + " - ");

    int instId = 1;
    // fill the vectors for each training example
    for (String oId : trainExamples.keySet()) {
        //System.err.println("sentence: "+ corpus.getOpinionSentence(o.getId()));

        //value vector
        double[] values = new double[featNum];

        // first element is the instanceId         
        values[rsltdata.attribute("instanceId").index()] = instId;

        LinkedList<String> ngrams = new LinkedList<String>();
        int ngramDim;
        try {
            ngramDim = Integer.valueOf(params.getProperty("wfngrams"));
        } catch (Exception e) {
            ngramDim = 0;
        }

        boolean polNgrams = false;
        if (params.containsKey("polNgrams")) {
            polNgrams = params.getProperty("polNgrams").equalsIgnoreCase("yes");
        }

        String[] noWindow = corpus.getOpinionSentence(oId).split("\n");

        //counter for opinion sentence token number. Used for computing relative values of the features
        int tokNum = noWindow.length;

        List<String> window = Arrays.asList(noWindow);
        Integer end = corpus.getOpinion(oId).getTo();
        // apply the window only if it is active (bowWin > 0) and the target is not null (to == 0 marks a null target)
        if ((bowWin > 0) && (end > 0)) {
            Integer start = corpus.getOpinion(oId).getFrom();
            Integer from = start - bowWin;
            if (from < 0) {
                from = 0;
            }
            Integer to = end + bowWin;
            if (to > noWindow.length - 1) {
                to = noWindow.length - 1;
            }
            window = Arrays.asList(Arrays.copyOfRange(noWindow, from, to));
        }

        //System.out.println("Sentence: "+corpus.getOpinionSentence(oId)+" - target: "+corpus.getOpinion(oId).getTarget()+
        //      "\n window: from-> "+window.get(0).getForm()+" to-> "+window.get(window.size()-1)+" .\n");

        //System.err.println(Arrays.toString(window.toArray()));

        // word form ngram related features
        for (String wf : window) {
            // a blank line marks a sentence end: empty the n-gram list and start over
            if (wf.equals("")) {
                // add remaining ngrams to the feature vector
                checkNgramFeatures(ngrams, values, "", 1, true); //toknum

                // since wf is empty no need to check for clusters and other features.
                continue;
            }

            String[] fields = wf.split("\t");
            String wfStr = normalize(fields[0], params.getProperty("normalization", "none"));

            if (params.containsKey("wfngrams") && ngramDim > 0) {
                if (!savePath.contains("_wf" + ngramDim)) {
                    savePath = savePath + "_wf" + ngramDim;
                }
                // keep a sliding window of at most ngramDim word forms
                if (ngrams.size() >= ngramDim) {
                    ngrams.removeFirst();
                }
                ngrams.add(wfStr);

                // add ngrams to the feature vector
                checkNgramFeatures(ngrams, values, "", 1, false); //toknum
            }
            // Clark cluster info corresponding to the current word form
            if (params.containsKey("clark") && attributeSets.get("ClarkCl").containsKey(wfStr)) {
                if (!savePath.contains("_cl")) {
                    savePath = savePath + "_cl";
                }
                values[rsltdata.attribute("ClarkClId_" + attributeSets.get("ClarkCl").get(wfStr)).index()]++;
            }

            // Brown cluster info corresponding to the current word form
            if (params.containsKey("brown") && attributeSets.get("BrownCl").containsKey(wfStr)) {
                if (!savePath.contains("_br")) {
                    savePath = savePath + "_br";
                }
                values[rsltdata.attribute("BrownClId_" + attributeSets.get("BrownCl").get(wfStr)).index()]++;
            }

            // word2vec cluster info corresponding to the current word form
            if (params.containsKey("word2vec") && attributeSets.get("w2vCl").containsKey(wfStr)) {
                if (!savePath.contains("_w2v")) {
                    savePath = savePath + "_w2v";
                }
                values[rsltdata.attribute("w2vClId_" + attributeSets.get("w2vCl").get(wfStr)).index()]++;
            }

        }

        //empty ngram list and add remaining ngrams to the feature list
        checkNgramFeatures(ngrams, values, "", 1, true); //toknum

        // PoS tagger related attributes: lemmas and pos tags
        if (params.containsKey("lemmaNgrams")
                || (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0"))
                || params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) {
            ngrams = new LinkedList<String>();
            if (params.containsKey("lemmaNgrams")
                    && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) {
                ngramDim = Integer.valueOf(params.getProperty("lemmaNgrams"));
            } else {
                ngramDim = 3;
            }
            LinkedList<String> posNgrams = new LinkedList<String>();
            int posNgramDim = 0;
            if (params.containsKey("pos")) {
                posNgramDim = Integer.valueOf(params.getProperty("pos"));
            }

            for (String t : window) {
                // lemmas
                if ((params.containsKey("lemmaNgrams")) || params.containsKey("polarLexiconGeneral")
                        || params.containsKey("polarLexiconDomain")) {
                    if (!savePath.contains("_l" + ngramDim)) {
                        savePath = savePath + "_l" + ngramDim;
                    }

                    // a blank line marks a sentence end: empty the n-gram list and start over
                    if (t.equals("")) {
                        // check both lemma n-grams and polarity lexicons, and add values to the feature vector
                        checkNgramsAndPolarLexicons(ngrams, values, "lemma", 1, tokNum, true, polNgrams); //toknum

                        // since t is empty no need to check for clusters and other features.
                        continue;
                    }

                    String[] fields = t.split("\t");
                    if (fields.length < 2) {
                        continue;
                    }
                    String lemma = normalize(fields[1], params.getProperty("normalization", "none"));

                    if (ngrams.size() >= ngramDim) {
                        ngrams.removeFirst();
                    }
                    ngrams.add(lemma);

                    // check both lemma n-grams and polarity lexicons, and add values to the feature vector
                    checkNgramsAndPolarLexicons(ngrams, values, "lemma", 1, tokNum, false, polNgrams);

                }

                //pos tags
                if (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0")) {
                    if (!savePath.contains("_p")) {
                        savePath = savePath + "_p";
                    }

                    String[] fields = t.split("\t");
                    if (fields.length < 3) {
                        continue;
                    }
                    String pos = fields[2];

                    // keep a sliding window of at most posNgramDim pos tags
                    if (posNgrams.size() >= posNgramDim) {
                        posNgrams.removeFirst();
                    }
                    posNgrams.add(pos);

                    // add ngrams to the feature vector
                    checkNgramFeatures(posNgrams, values, "pos", 1, false);
                }
            } //endFor

            //empty ngram list and add remaining ngrams to the feature list
            // check both lemma n-grams and polarity lexicons, and add values to the feature vector
            checkNgramsAndPolarLexicons(ngrams, values, "", 1, tokNum, true, polNgrams);

            //empty pos ngram list and add remaining pos ngrams to the feature list
            checkNgramFeatures(posNgrams, values, "pos", 1, true);

        }

        // add sentence length as a feature
        if (params.containsKey("sentenceLength")
                && (!params.getProperty("sentenceLength").equalsIgnoreCase("no"))) {
            values[rsltdata.attribute("sentenceLength").index()] = tokNum;
        }

        // compute uppercase ratio before normalization (if needed)      
        //double upRatio =0.0;
        //if (params.getProperty("upperCaseRatio", "no").equalsIgnoreCase("yes"))
        //{
        //   String upper = opNormalized.replaceAll("[a-z]", "");
        //   upRatio = (double)upper.length() / (double)opNormalized.length();
        //   values[rsltdata.attribute("upperCaseRation").index()] = upRatio;
        //}

        //create object for the current instance and associate it with the current train dataset.         
        Instance inst = new SparseInstance(1.0, values);
        inst.setDataset(rsltdata);

        // add category attribute values
        String cat = trainExamples.get(oId).getCategory();

        if (params.containsKey("categories") && params.getProperty("categories").compareTo("E&A") == 0) {
            if (cat.compareTo("NULL") == 0) {
                inst.setValue(rsltdata.attribute("entCat").index(), cat);
                inst.setValue(rsltdata.attribute("attCat").index(), cat);
            } else {
                String[] splitCat = cat.split("#");
                inst.setValue(rsltdata.attribute("entCat").index(), splitCat[0]);
                inst.setValue(rsltdata.attribute("attCat").index(), splitCat[1]);
            }

            //inst.setValue(attIndexes.get("entAttCat"), cat);
        } else if (params.containsKey("categories") && params.getProperty("categories").compareTo("E#A") == 0) {
            inst.setValue(rsltdata.attribute("entAttCat").index(), cat);
        }

        if (params.containsKey("polarity") && params.getProperty("polarity").compareTo("yes") == 0) {
            // add class value as a double (Weka stores all values as doubles)
            String pol = normalizePolarity(trainExamples.get(oId).getPolarity());
            if (pol != null && !pol.isEmpty()) {
                inst.setValue(rsltdata.attribute("polarityCat"), pol);
            } else {
                //System.err.println("polarity: _"+pol+"_");
                inst.setMissing(rsltdata.attribute("polarityCat"));
            }
        }

        //add instance to train data
        rsltdata.add(inst);

        //store opinion Id and instance Id
        this.opInst.put(oId, instId);
        instId++;
    }

    System.err.println("Features : loadInstancesTAB() - training data ready total number of examples -> "
            + trainExamplesNum + " - " + rsltdata.numInstances());

    if (save) {
        try {
            savePath = savePath + ".arff";
            System.err.println("arff written to: " + savePath);
            ArffSaver saver = new ArffSaver();

            saver.setInstances(rsltdata);

            saver.setFile(new File(savePath));
            saver.writeBatch();
        } catch (IOException e1) {
            e1.printStackTrace();
        } catch (Exception e2) {
            e2.printStackTrace();
        }
    }

    return rsltdata;
}
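
A last pattern shared by both loadInstances variants is the end-of-sentence flush: when the token loop ends, the leftover window is drained by emitting progressively shorter suffixes until the list is empty. A minimal sketch, with emit() standing in for the real feature-vector update:

import java.util.Arrays;
import java.util.LinkedList;

public class NgramFlush {

    // Drains a leftover n-gram window at sentence end, emitting the
    // shrinking suffixes exactly as the while (!ngrams.isEmpty()) loop
    // in loadInstances() does; size() > 0 is equivalent to !isEmpty().
    public static void flush(LinkedList<String> ngrams) {
        while (ngrams.size() > 0) {
            emit(String.join(" ", ngrams));
            ngrams.removeFirst();
        }
    }

    // placeholder for the real feature-vector update
    private static void emit(String ngram) {
        System.out.println("feature: " + ngram);
    }

    public static void main(String[] args) {
        // prints "not very good", "very good", "good" as features
        flush(new LinkedList<String>(Arrays.asList("not", "very", "good")));
    }
}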