List of usage examples for java.util.PriorityQueue.poll()
public E poll()
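poll() retrieves and removes the head of the queue, that is, the least element under natural ordering or under the supplied Comparator, and it returns null rather than throwing when the queue is empty. A minimal self-contained sketch of that behavior (the values are arbitrary):

import java.util.PriorityQueue;

public class PollBasics {
    public static void main(String[] args) {
        PriorityQueue<Integer> pq = new PriorityQueue<>();
        pq.add(5);
        pq.add(1);
        pq.add(3);
        // Elements come out in priority order, smallest first.
        while (!pq.isEmpty()) {
            System.out.println(pq.poll()); // prints 1, 3, 5
        }
        // Unlike remove(), poll() does not throw on an empty queue.
        System.out.println(pq.poll());     // prints null
    }
}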
From source file:com.addthis.hydra.data.io.DiskBackedList2.java
/**
 * Sort the collection of elements using a standard external sort algorithm: sort each chunk of elements, then
 * merge the chunks into a new list, then switch to the new list.
 */
public void sort(final Comparator<? super K> comp) {
    try {
        // Sort each chunk. Done if there is only one chunk.
        sortEachChunk(comp);
        if (chunks.size() <= 1) {
            return;
        }
        Comparator<Pair<K, Integer>> pairComp = new Comparator<Pair<K, Integer>>() {
            @Override
            public int compare(Pair<K, Integer> e1, Pair<K, Integer> e2) {
                return comp.compare(e1.getLeft(), e2.getLeft());
            }
        };
        // This heap stores the lowest remaining value from each chunk
        PriorityQueue<Pair<K, Integer>> heap = new PriorityQueue<>(chunks.size(), pairComp);
        ArrayList<Iterator> iterators = new ArrayList<>(chunks.size());
        // Initialize the heap with one value per chunk
        close();
        for (int i = 0; i < chunks.size(); i++) {
            Iterator<K> it = chunks.get(i).getChunkIterator();
            iterators.add(i, it);
            if (it.hasNext()) {
                K elt = it.next();
                if (elt != null) {
                    heap.add(Pair.of(elt, i));
                }
            }
        }
        // Make a new disk backed list to store sorted values.
        // When the number of chunks is large, the size of the output buffer needs to shrink
        // to make up for the extra mem usage
        long storageMaxChunkSize = maxChunkSizeBytes / (1 + chunks.size() / 20);
        DiskBackedList2<K> storage = new DiskBackedList2<>(codec, storageMaxChunkSize, directory);
        // Repeatedly pull the smallest element from the heap
        while (!heap.isEmpty()) {
            Pair<K, Integer> leastElt = heap.poll();
            storage.add(leastElt.getLeft());
            @SuppressWarnings({ "unchecked" })
            Iterator<K> polledIterator = iterators.get(leastElt.getRight());
            if (polledIterator.hasNext()) {
                heap.add(Pair.of(polledIterator.next(), leastElt.getRight()));
            }
        }
        // Switch to the storage dbl's chunks
        storage.close();
        chunks = storage.getChunks();
        currentChunk = null;
    } catch (IOException io) {
        throw Throwables.propagate(io);
    }
}
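The sort above uses the heap for a k-way merge: one cursor per sorted chunk, with poll() always yielding the globally smallest element, which is then replaced by the next value from the same chunk. A stripped-down sketch of the same pattern over in-memory lists; DiskBackedList2 and its chunk iterators are replaced by plain List<Integer> inputs, so the names and types here are illustrative only:

import java.util.*;

public class KWayMerge {
    /** Merges already-sorted lists into one sorted list using a min-heap of (value, sourceIndex) pairs. */
    static List<Integer> merge(List<List<Integer>> sortedChunks) {
        // Heap entry: element[0] = value, element[1] = index of the chunk it came from.
        PriorityQueue<int[]> heap = new PriorityQueue<>(Comparator.comparingInt((int[] e) -> e[0]));
        List<Iterator<Integer>> cursors = new ArrayList<>();
        for (int i = 0; i < sortedChunks.size(); i++) {
            Iterator<Integer> it = sortedChunks.get(i).iterator();
            cursors.add(it);
            if (it.hasNext()) {
                heap.add(new int[] { it.next(), i });
            }
        }
        List<Integer> out = new ArrayList<>();
        while (!heap.isEmpty()) {
            int[] least = heap.poll();              // smallest remaining value across all chunks
            out.add(least[0]);
            Iterator<Integer> src = cursors.get(least[1]);
            if (src.hasNext()) {                    // refill from the chunk we just consumed from
                heap.add(new int[] { src.next(), least[1] });
            }
        }
        return out;
    }

    public static void main(String[] args) {
        System.out.println(merge(Arrays.asList(
                Arrays.asList(1, 4, 9), Arrays.asList(2, 3, 8), Arrays.asList(5, 6, 7))));
    }
}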
From source file:org.apache.sysml.runtime.compress.CompressedMatrixBlock.java
private static ColGroup compressColGroup(MatrixBlock in, CompressedSizeEstimator estim,
        HashMap<Integer, Double> compRatios, int rlen, double sp, int[] colIndexes) {
    int[] allGroupIndices = null;
    int allColsCount = colIndexes.length;
    CompressedSizeInfo sizeInfo;
    // The compression type is decided based on a full bitmap since it
    // will be reused for the actual compression step.
    UncompressedBitmap ubm = null;
    PriorityQueue<CompressedColumn> compRatioPQ = null;
    boolean skipGroup = false;
    while (true) {
        // exact big list and observe compression ratio
        ubm = BitmapEncoder.extractBitmap(colIndexes, in);
        sizeInfo = estim.estimateCompressedColGroupSize(ubm);
        double compRatio = getUncompressedSize(rlen, colIndexes.length, sp) / sizeInfo.getMinSize();
        if (compRatio > 1) {
            break; // we have a good group
        }

        // modify the group
        if (compRatioPQ == null) {
            // first modification
            allGroupIndices = colIndexes.clone();
            compRatioPQ = new PriorityQueue<CompressedMatrixBlock.CompressedColumn>();
            for (int i = 0; i < colIndexes.length; i++)
                compRatioPQ.add(new CompressedColumn(i, compRatios.get(colIndexes[i])));
        }

        // index in allGroupIndices
        int removeIx = compRatioPQ.poll().colIx;
        allGroupIndices[removeIx] = -1;
        allColsCount--;
        if (allColsCount == 0) {
            skipGroup = true;
            break;
        }
        colIndexes = new int[allColsCount];
        // copying the values that do not equal -1
        int ix = 0;
        for (int col : allGroupIndices)
            if (col != -1)
                colIndexes[ix++] = col;
    }

    // add group to uncompressed fallback
    if (skipGroup)
        return null;

    // create compressed column group
    long rleSize = sizeInfo.getRLESize();
    long oleSize = sizeInfo.getOLESize();
    if (rleSize < oleSize)
        return new ColGroupRLE(colIndexes, rlen, ubm);
    else
        return new ColGroupOLE(colIndexes, rlen, ubm);
}
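In this example the queue orders candidate columns by compression ratio (CompressedColumn is Comparable), and poll() repeatedly removes the weakest column until the remaining group compresses acceptably or nothing is left. A hedged sketch of that shrink-by-polling loop, with a made-up Candidate class standing in for CompressedColumn:

import java.util.PriorityQueue;

public class ShrinkByWorst {
    /** Illustrative stand-in for CompressedColumn: ordered so the lowest ratio is polled first. */
    static class Candidate implements Comparable<Candidate> {
        final int index;
        final double ratio;
        Candidate(int index, double ratio) { this.index = index; this.ratio = ratio; }
        @Override public int compareTo(Candidate o) { return Double.compare(ratio, o.ratio); }
    }

    public static void main(String[] args) {
        PriorityQueue<Candidate> worstFirst = new PriorityQueue<>();
        worstFirst.add(new Candidate(0, 1.8));
        worstFirst.add(new Candidate(1, 0.6));
        worstFirst.add(new Candidate(2, 3.2));
        // Drop candidates one at a time, weakest compression ratio first,
        // until the remaining group passes a (made-up) acceptance threshold.
        while (worstFirst.size() > 1 && worstFirst.peek().ratio < 1.0) {
            Candidate removed = worstFirst.poll();
            System.out.println("dropping column " + removed.index + " (ratio " + removed.ratio + ")");
        }
        System.out.println(worstFirst.size() + " columns kept");
    }
}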
From source file:org.kuali.rice.krms.framework.engine.TermResolutionEngineImpl.java
/** * * @param termName// w ww. ja v a 2 s . co m * @return List<{@link TermResolverKey}> */ protected List<TermResolverKey> buildTermResolutionPlan(String termName) { // our result List<TermResolverKey> resolutionPlan = null; // Holds the resolvers we've visited, along with the needed metadata for generating our final plan Map<TermResolverKey, Visited> visitedByKey = new HashMap<TermResolverKey, Visited>(); // this holds a least cost first list of nodes remaining to be explored PriorityQueue<ToVisit> toVisits = new PriorityQueue<ToVisit>(); // nice grammar there cowboy // dummy resolver to be the root of this tree // Do I really need this? Yes, because there may be more than one resolver that resolves to the desired termName, // so this destination unifies the trees of those candidate resolvers TermResolver destination = createDestination(termName); // problem is we can't get this one out of the registry TermResolverKey destinationKey = new TermResolverKey(destination); LOG.debug("Beginning resolution tree search for " + termName); // seed our queue of resolvers to visit // need to be aware of null parent for root ToVisit toVisits.add(new ToVisit(0, destination, null)); // there may not be a viable plan boolean plannedToDestination = false; // We'll do a modified Dijkstra's shortest path algorithm, where at each leaf we see if we've planned out // termName resolution all the way up to the root, our destination. If so, we just reconstruct our plan. while (!plannedToDestination && toVisits.size() > 0) { // visit least cost node remaining ToVisit visiting = toVisits.poll(); LOG.debug("visiting " + visiting.getTermResolverKey()); // the resolver is the edge in our tree -- we don't get it directly from the termResolversByKey Map, because it could be our destination TermResolver resolver = getResolver(visiting.getTermResolverKey(), destination, destinationKey); TermResolver parent = getResolver(visiting.getParentKey(), destination, destinationKey); if (visitedByKey.containsKey(visiting.getTermResolverKey())) { continue; // We've already visited this one } Visited parentVisited = visitedByKey.get(visiting.getParentKey()); if (resolver == null) throw new RuntimeException("Unable to get TermResolver by its key"); Set<String> prereqs = resolver.getPrerequisites(); // keep track of any prereqs that we already have handy List<String> metPrereqs = new LinkedList<String>(); // see what prereqs we have already, and which we'll need to visit if (prereqs != null) for (String prereq : prereqs) { if (!termCache.containsKey(new Term(prereq, null))) { // enqueue all resolvers in toVisits List<TermResolver<?>> prereqResolvers = termResolversByOutput.get(prereq); if (prereqResolvers != null) for (TermResolver prereqResolver : prereqResolvers) { // Only TermResolvers that don't take paramaterized terms can be chained, so: // if the TermResolver doesn't take parameters, or it resolves the output termName if (CollectionUtils.isEmpty(prereqResolver.getParameterNames()) || termName.equals(prereqResolver.getOutput())) { // queue it up for visiting toVisits.add(new ToVisit(visiting.getCost() /* cost to get to this resolver */, prereqResolver, resolver)); } } } else { metPrereqs.add(prereq); } } // Build visited info Visited visited = buildVisited(resolver, parentVisited, metPrereqs); visitedByKey.put(visited.getResolverKey(), visited); plannedToDestination = isPlannedBackToDestination(visited, destinationKey, visitedByKey); } if (plannedToDestination) { // build result from Visited tree. 
resolutionPlan = new LinkedList<TermResolverKey>(); assembleLinearResolutionPlan(visitedByKey.get(destinationKey), visitedByKey, resolutionPlan); } return resolutionPlan; }
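The planner above is a modified Dijkstra search: ToVisit entries are ordered by cost, poll() always returns the cheapest unexplored resolver, and keys that were already visited are skipped. A compact, generic sketch of that least-cost-first loop over an illustrative adjacency map rather than the KRMS resolver registry:

import java.util.*;

public class LeastCostFirst {
    static class Step {
        final String node; final int cost;
        Step(String node, int cost) { this.node = node; this.cost = cost; }
    }

    /** Returns the minimum cost from start to every reachable node of a weighted digraph. */
    static Map<String, Integer> cheapestCosts(Map<String, Map<String, Integer>> edges, String start) {
        // Frontier ordered by accumulated cost; poll() always yields the cheapest unexplored step.
        PriorityQueue<Step> toVisit = new PriorityQueue<>(Comparator.comparingInt((Step s) -> s.cost));
        toVisit.add(new Step(start, 0));
        Map<String, Integer> visited = new HashMap<>();
        while (!toVisit.isEmpty()) {
            Step current = toVisit.poll();
            if (visited.containsKey(current.node)) {
                continue; // already settled at an equal or lower cost
            }
            visited.put(current.node, current.cost);
            for (Map.Entry<String, Integer> edge
                    : edges.getOrDefault(current.node, Map.of()).entrySet()) {
                if (!visited.containsKey(edge.getKey())) {
                    toVisit.add(new Step(edge.getKey(), current.cost + edge.getValue()));
                }
            }
        }
        return visited;
    }

    public static void main(String[] args) {
        Map<String, Map<String, Integer>> edges = new HashMap<>();
        edges.put("a", Map.of("b", 1, "c", 4));
        edges.put("b", Map.of("c", 2));
        System.out.println(cheapestCosts(edges, "a")); // e.g. {a=0, b=1, c=3} (HashMap order may vary)
    }
}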
From source file:blusunrize.immersiveengineering.api.energy.wires.ImmersiveNetHandler.java
public Set<AbstractConnection> getIndirectEnergyConnections(BlockPos node, World world, boolean ignoreIsEnergyOutput) { int dimension = world.provider.getDimension(); if (!ignoreIsEnergyOutput && indirectConnections.containsKey(dimension) && indirectConnections.get(dimension).containsKey(node)) return indirectConnections.get(dimension).get(node); else if (ignoreIsEnergyOutput && indirectConnectionsIgnoreOut.containsKey(dimension) && indirectConnectionsIgnoreOut.get(dimension).containsKey(node)) return indirectConnectionsIgnoreOut.get(dimension).get(node); PriorityQueue<Pair<IImmersiveConnectable, Float>> queue = new PriorityQueue<>( Comparator.comparingDouble(Pair::getRight)); Set<AbstractConnection> closedList = newSetFromMap(new ConcurrentHashMap<AbstractConnection, Boolean>()); List<BlockPos> checked = new ArrayList<>(); HashMap<BlockPos, BlockPos> backtracker = new HashMap<>(); checked.add(node);// ww w . ja va 2 s .c o m Set<Connection> conL = getConnections(world, node); if (conL != null) for (Connection con : conL) { IImmersiveConnectable end = toIIC(con.end, world); if (end != null) { queue.add(new ImmutablePair<>(end, con.getBaseLoss())); backtracker.put(con.end, node); } } IImmersiveConnectable next; final int closedListMax = 1200; while (closedList.size() < closedListMax && !queue.isEmpty()) { Pair<IImmersiveConnectable, Float> pair = queue.poll(); next = pair.getLeft(); float loss = pair.getRight(); BlockPos nextPos = toBlockPos(next); if (!checked.contains(nextPos) && queue.stream().noneMatch((p) -> p.getLeft().equals(nextPos))) { boolean isOutput = next.isEnergyOutput(); if (ignoreIsEnergyOutput || isOutput) { BlockPos last = toBlockPos(next); WireType minimumType = null; int distance = 0; List<Connection> connectionParts = new ArrayList<>(); while (last != null) { BlockPos prev = last; last = backtracker.get(last); if (last != null) { Set<Connection> conLB = getConnections(world, last); if (conLB != null) for (Connection conB : conLB) if (conB.end.equals(prev)) { connectionParts.add(0, conB); distance += conB.length; if (minimumType == null || conB.cableType.getTransferRate() < minimumType.getTransferRate()) minimumType = conB.cableType; break; } } } closedList.add(new AbstractConnection(toBlockPos(node), toBlockPos(next), minimumType, distance, isOutput, connectionParts.toArray(new Connection[connectionParts.size()]))); } Set<Connection> conLN = getConnections(world, toBlockPos(next)); if (conLN != null) for (Connection con : conLN) if (next.allowEnergyToPass(con)) { IImmersiveConnectable end = toIIC(con.end, world); Optional<Pair<IImmersiveConnectable, Float>> existing = queue.stream() .filter((p) -> p.getLeft() == end).findAny(); float newLoss = con.getBaseLoss() + loss; if (end != null && !checked.contains(con.end) && existing.map(Pair::getRight).orElse(Float.MAX_VALUE) > newLoss) { existing.ifPresent(p1 -> queue.removeIf((p2) -> p1.getLeft() == p2.getLeft())); queue.add(new ImmutablePair<>(end, newLoss)); backtracker.put(con.end, toBlockPos(next)); } } checked.add(toBlockPos(next)); } } if (FMLCommonHandler.instance().getEffectiveSide() == Side.SERVER) { if (ignoreIsEnergyOutput) { if (!indirectConnectionsIgnoreOut.containsKey(dimension)) indirectConnectionsIgnoreOut.put(dimension, new ConcurrentHashMap<>()); Map<BlockPos, Set<AbstractConnection>> conns = indirectConnectionsIgnoreOut.get(dimension); if (!conns.containsKey(node)) conns.put(node, newSetFromMap(new ConcurrentHashMap<>())); conns.get(node).addAll(closedList); } else { if 
(!indirectConnections.containsKey(dimension)) indirectConnections.put(dimension, new ConcurrentHashMap<>()); Map<BlockPos, Set<AbstractConnection>> conns = indirectConnections.get(dimension); if (!conns.containsKey(node)) conns.put(node, newSetFromMap(new ConcurrentHashMap<>())); conns.get(node).addAll(closedList); } } return closedList; }
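Because java.util.PriorityQueue has no decrease-key operation, the search above relaxes a path by removing the stale queue entry (queue.removeIf) and re-adding the connectable with its lower loss. A minimal sketch of that remove-and-reinsert idiom, with int[] {nodeId, cost} entries as illustrative stand-ins:

import java.util.*;

public class RelaxAndReinsert {
    public static void main(String[] args) {
        // Entries: {nodeId, cost}. PriorityQueue has no decrease-key, so a cheaper
        // path is handled by removing the stale entry and inserting a new one.
        PriorityQueue<int[]> queue = new PriorityQueue<>(Comparator.comparingInt((int[] e) -> e[1]));
        queue.add(new int[] { 7, 10 });
        queue.add(new int[] { 8, 4 });

        int node = 7, newCost = 3;
        Optional<int[]> existing = queue.stream().filter(e -> e[0] == node).findAny();
        if (existing.map(e -> e[1]).orElse(Integer.MAX_VALUE) > newCost) {
            existing.ifPresent(stale -> queue.removeIf(e -> e[0] == stale[0])); // drop the worse entry
            queue.add(new int[] { node, newCost });                             // re-add with the lower cost
        }
        while (!queue.isEmpty()) {
            int[] next = queue.poll(); // cheapest remaining entry first
            System.out.println("node " + next[0] + " at cost " + next[1]);
        }
    }
}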
From source file:edu.utsa.sifter.som.MainSOM.java
void initTerms() throws IOException {
    final Terms terms = MultiFields.getTerms(Reader, "body");
    System.out.println("number of terms in index: " + terms.size());
    final PriorityQueue<TermPair> topTerms = new PriorityQueue<TermPair>(Conf.MAX_VECTOR_FEATURES,
            new TermPair.TermPairComparator());

    int num = 0;
    TermsEnum term = terms.iterator(null);
    while (term.next() != null) {
        final int count = term.docFreq();
        final double r = ((double) count) / Reader.numDocs();
        if (Conf.DOC_FREQ_THRESHOLD_LOW <= r && r <= Conf.DOC_FREQ_THRESHOLD_HIGH) {
            final String s = term.term().utf8ToString();
            if (s.length() >= Conf.MIN_SOM_TERM_LENGTH) {
                if (topTerms.size() < Conf.MAX_VECTOR_FEATURES) {
                    topTerms.add(new TermPair(s, count));
                } else if (topTerms.peek().DocCount < count) {
                    topTerms.remove();
                    topTerms.add(new TermPair(s, count));
                }
                ++num;
            }
        }
    }
    System.out.println(num + " terms with in doc frequency range");

    final int numFeatures = Math.min(topTerms.size(), Conf.MAX_VECTOR_FEATURES);
    TermIndices = new HashMap<String, Integer>((numFeatures * 4 + 1) / 3); // respect load factor
    Terms = new java.util.Vector<String>(numFeatures);
    Terms.setSize(numFeatures);
    System.out.println("the top " + numFeatures + " features will be used");
    for (int i = numFeatures - 1; i > -1; --i) { // reverse order, to put top terms first
        TermPair t = topTerms.poll(); // least remaining
        TermIndices.put(t.Term, i);
        Terms.set(i, t.Term);
        // System.out.println("Including term " + t.Term + " (" + t.DocCount + ")");
    }
}
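This is the bounded top-K idiom: keep a min-heap of at most MAX_VECTOR_FEATURES entries, compare each new term against peek(), and evict the head when a more frequent term arrives, so memory stays proportional to K rather than to the vocabulary. A self-contained sketch with plain strings and counts in place of TermPair and the Lucene plumbing:

import java.util.*;

public class TopKTerms {
    static class TermCount {
        final String term; final int count;
        TermCount(String term, int count) { this.term = term; this.count = count; }
        @Override public String toString() { return term + "=" + count; }
    }

    /** Keeps only the k highest counts using a size-bounded min-heap. */
    static List<TermCount> topK(List<TermCount> items, int k) {
        PriorityQueue<TermCount> heap =
                new PriorityQueue<>(k, Comparator.comparingInt((TermCount t) -> t.count));
        for (TermCount item : items) {
            if (heap.size() < k) {
                heap.add(item);
            } else if (heap.peek().count < item.count) {
                heap.poll();   // evict the smallest of the current top k
                heap.add(item);
            }
        }
        // Draining with poll() yields the survivors smallest-first; reverse for a best-first list.
        List<TermCount> result = new ArrayList<>();
        while (!heap.isEmpty()) {
            result.add(heap.poll());
        }
        Collections.reverse(result);
        return result;
    }

    public static void main(String[] args) {
        System.out.println(topK(Arrays.asList(new TermCount("alpha", 3), new TermCount("beta", 9),
                new TermCount("gamma", 1), new TermCount("delta", 7)), 2)); // [beta=9, delta=7]
    }
}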
From source file:org.mskcc.cbio.portal.servlet.NetworkServlet.java
/**
 * @param network
 * @param diffusion
 * @param n
 * @return
 */
private List<Node> getNodesToRemove(final Network network, final double diffusion, final int n) {
    final Map<Node, Double> mapDiffusion = getMapDiffusedTotalAlteredPercentage(network, diffusion);
    // keep track of the top nKeep
    PriorityQueue<Node> topAlteredNodes = new PriorityQueue<Node>(n, new Comparator<Node>() {
        public int compare(Node n1, Node n2) {
            int ret = mapDiffusion.get(n1).compareTo(mapDiffusion.get(n2));
            if (diffusion != 0 && ret == 0) { // if the same diffused perc, use own perc
                ret = Double.compare(getTotalAlteredPercentage(n1), getTotalAlteredPercentage(n2));
            }
            if (ret == 0) { // if the same, rank according to degree
                ret = network.getDegree(n1) - network.getDegree(n2);
            }
            return ret;
        }
    });
    List<Node> nodesToRemove = new ArrayList<Node>();
    for (Node node : network.getNodes()) {
        if (isInQuery(node) || node.getType().equals(NodeType.DRUG)) {
            continue;
        }
        if (topAlteredNodes.size() < n) {
            topAlteredNodes.add(node);
        } else {
            if (n == 0) {
                nodesToRemove.add(node);
            } else {
                if (mapDiffusion.get(node) > mapDiffusion.get(topAlteredNodes.peek())) {
                    nodesToRemove.add(topAlteredNodes.poll());
                    topAlteredNodes.add(node);
                } else {
                    nodesToRemove.add(node);
                }
            }
        }
    }
    return nodesToRemove;
}
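The anonymous Comparator above chains three criteria by hand (diffused percentage, then the node's own altered percentage, then degree). The same tie-breaking can be written declaratively with Comparator.comparingDouble(...).thenComparing...; a sketch with an illustrative ScoredNode stand-in for the servlet's Node type:

import java.util.*;

public class TieBreakingComparator {
    static class ScoredNode {
        final String id;
        final double diffused;   // primary key
        final double ownPct;     // first tie-breaker
        final int degree;        // second tie-breaker
        ScoredNode(String id, double diffused, double ownPct, int degree) {
            this.id = id; this.diffused = diffused; this.ownPct = ownPct; this.degree = degree;
        }
    }

    public static void main(String[] args) {
        Comparator<ScoredNode> byRank = Comparator
                .comparingDouble((ScoredNode node) -> node.diffused)
                .thenComparingDouble(node -> node.ownPct)
                .thenComparingInt(node -> node.degree);
        PriorityQueue<ScoredNode> queue = new PriorityQueue<>(byRank);
        queue.add(new ScoredNode("a", 0.4, 0.2, 3));
        queue.add(new ScoredNode("b", 0.4, 0.2, 1));
        queue.add(new ScoredNode("c", 0.1, 0.9, 9));
        while (!queue.isEmpty()) {
            System.out.println(queue.poll().id); // c, b, a: the lowest-ranked node is polled first
        }
    }
}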
From source file:$.HyperGraphBuilder$.java
@Override public V_GenericGraph makeGraphResponse(final V_GraphQuery graphQuery) throws Exception { nodeList = new HashMap<String, V_GenericNode>(); // edgeMap = new HashMap<String, V_GenericEdge>(); edgeList = new HashMap<String, V_GenericEdge>(); scannedQueries = new HashSet<String>(); final PriorityQueue<G_EntityQuery> queriesToRun = new PriorityQueue<G_EntityQuery>(10, new ScoreComparator()); Map<String, V_GenericNode> nodesFromPreviousDegree = new HashMap<String, V_GenericNode>(); Map<String, V_GenericEdge> edgesFromPreviousDegree = new HashMap<String, V_GenericEdge>(); if (graphQuery.getMaxHops() <= 0) { return new V_GenericGraph(); } else {// w w w . ja va 2 s .c o m logger.debug("Attempting a graph for query " + graphQuery.toString()); } int intStatus = 0; String strStatus = "Graph Loaded"; final G_PropertyMatchDescriptor identifierList = G_PropertyMatchDescriptor.newBuilder().setKey("_all") .setListRange(new ListRangeHelper(G_PropertyType.STRING, graphQuery.getSearchIds())) .setConstraint(G_Constraint.EQUALS).build(); final QueryHelper qh = new QueryHelper(identifierList); qh.setTargetSchema(index); queriesToRun.add(qh); int currentDegree = 0; for (currentDegree = 0; (currentDegree < graphQuery.getMaxHops()) && (nodeList.size() < graphQuery.getMaxNodes()); currentDegree++) { G_EntityQuery eq = null; logger.debug("${symbol_dollar}${symbol_dollar}${symbol_dollar}${symbol_dollar}There are " + queriesToRun.size() + " queries to run in the current degree."); while ((queriesToRun.size() > 0) && ((eq = queriesToRun.poll()) != null) && (nodeList.size() < graphQuery.getMaxNodes())) { if (ValidationUtils.isValid(eq.getPropertyMatchDescriptors())) { nodesFromPreviousDegree = new HashMap<String, V_GenericNode>(nodeList); edgesFromPreviousDegree = new HashMap<String, V_GenericEdge>(edgeList); logger.debug("Processing degree " + currentDegree); /** * This will end up building nodes and edges, and creating * new queries for the queue */ logger.debug("1111=====Running query " + eq.toString()); getDAO().performCallback(0, eq.getMaxResult(), this, eq); logger.debug("3333====After running " + eq.toString() + ", there are " + queriesToRunNextDegree.size() + " queries to run in the next degree."); } } // end while loop // very important!! // unscannedNodeList.clear(); // //////////////////////////////////////////////// logger.debug("4444==== At the end of degree " + currentDegree + ", there are " + nodeList.size() + " nodes and " + edgeList.size() + " edges"); logger.debug( "5555====There are " + queriesToRunNextDegree.size() + " queries to run in the next degree."); queriesToRun.addAll(queriesToRunNextDegree); queriesToRunNextDegree.clear(); } // All hops have been done // Check to see if we have too many nodes. if (nodeList.size() > graphQuery.getMaxNodes()) { nodeList = nodesFromPreviousDegree; edgeList = edgesFromPreviousDegree; intStatus = 1; // will trigger the message. strStatus = "Returning only " + currentDegree + " hops, as maximum nodes you requested would be exceeded"; } else { intStatus = 1; // will trigger the message. strStatus = "Returning " + nodeList.size() + " nodes and " + edgeList.size() + " edges."; } // NOW finally add in all those unique edges. performPostProcess(graphQuery); final V_GenericGraph g = new V_GenericGraph(nodeList, edgeList); g.setIntStatus(intStatus); g.setStrStatus(strStatus); logger.debug("Graph status: " + g.getStrStatus()); for (final V_LegendItem li : legendItems) { g.addLegendItem(li); } return g; }
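The graph builder drains a PriorityQueue of entity queries for each hop: poll() hands back the highest-priority query, its results become nodes, and any follow-up queries are queued for the next degree. A loose, illustrative sketch of that drain-per-degree loop, with plain Strings standing in for G_EntityQuery and the ScoreComparator ordering:

import java.util.*;

public class HopLimitedExpansion {
    public static void main(String[] args) {
        // Queries ordered by priority (here: natural String order); contents are illustrative.
        PriorityQueue<String> queriesToRun = new PriorityQueue<>();
        queriesToRun.add("seed-query");
        List<String> nodes = new ArrayList<>();
        int maxHops = 2, maxNodes = 10;

        for (int degree = 0; degree < maxHops && nodes.size() < maxNodes; degree++) {
            PriorityQueue<String> nextDegree = new PriorityQueue<>();
            String query;
            // Drain the current degree, highest-priority query first; poll() returns null when empty.
            while ((query = queriesToRun.poll()) != null && nodes.size() < maxNodes) {
                nodes.add("node-from-" + query);     // pretend the query returned one node
                nextDegree.add(query + "-followup"); // and produced one follow-up query
            }
            queriesToRun.addAll(nextDegree);         // the next hop starts from the follow-ups
        }
        System.out.println(nodes);
    }
}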
From source file:beast.evolution.tree.ConstrainedClusterTree.java
/** * Perform clustering using a link method * This implementation uses a priority queue resulting in a O(n^2 log(n)) algorithm * * @param nClusters number of clusters * @param nClusterID/*from ww w .j a v a 2 s. com*/ * @param clusterNodes */ void doLinkClustering(int nClusters, final List<Integer>[] nClusterID, final NodeX[] clusterNodes) { Log.warning.print("Calculating distance"); final int nInstances = taxaNames.size(); final PriorityQueue<Tuple> queue = new PriorityQueue<Tuple>(nClusters * nClusters / 2, new TupleComparator()); final double[][] fDistance0 = new double[nClusters][nClusters]; for (int i = 0; i < nClusters; i++) { fDistance0[i][i] = 0; for (int j = i + 1; j < nClusters; j++) { fDistance0[i][j] = getDistance0(nClusterID[i], nClusterID[j]); fDistance0[j][i] = fDistance0[i][j]; if (isCompatible(i, j, nClusterID)) { queue.add(new Tuple(fDistance0[i][j], i, j, 1, 1)); } } // feedback on progress if ((i + 1) % 100 == 0) { if ((i + 1) % 1000 == 0) { Log.warning.print('|'); } else { Log.warning.print('.'); } } } Log.warning.print("\nClustering: "); while (nClusters > 1) { int iMin1 = -1; int iMin2 = -1; // use priority queue to find next best pair to cluster Tuple t; do { t = queue.poll(); } while (t != null && (nClusterID[t.m_iCluster1].size() != t.m_nClusterSize1 || nClusterID[t.m_iCluster2].size() != t.m_nClusterSize2)); iMin1 = t.m_iCluster1; iMin2 = t.m_iCluster2; merge(iMin1, iMin2, t.m_fDist / 2.0, t.m_fDist / 2.0, nClusterID, clusterNodes); updateConstraints(nClusterID[iMin1]); // merge clusters // update distances & queue for (int i = 0; i < nInstances; i++) { if (i != iMin1 && nClusterID[i].size() != 0) { final int i1 = Math.min(iMin1, i); final int i2 = Math.max(iMin1, i); if (isCompatible(i1, i2, nClusterID)) { final double fDistance = getDistance(fDistance0, nClusterID[i1], nClusterID[i2]); queue.add(new Tuple(fDistance, i1, i2, nClusterID[i1].size(), nClusterID[i2].size())); } } } nClusters--; // feedback on progress if (nClusters % 100 == 0) { if (nClusters % 1000 == 0) { Log.warning.print('|'); } else { Log.warning.print('.'); } } } Log.warning.println(" done."); }
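A detail worth noting above is the do/while around queue.poll(): because entries cannot be updated in place, tuples whose recorded cluster sizes no longer match the current clusters are simply discarded when polled (lazy deletion). A small sketch of that skip-stale-entries idiom, using a made-up version number in place of the cluster-size check:

import java.util.*;

public class LazyDeletion {
    // An entry is "stale" if the version recorded at insertion no longer matches the current one.
    static class Entry {
        final double dist; final int id; final int version;
        Entry(double dist, int id, int version) { this.dist = dist; this.id = id; this.version = version; }
    }

    public static void main(String[] args) {
        Map<Integer, Integer> currentVersion = Map.of(1, 2, 2, 1);
        PriorityQueue<Entry> queue = new PriorityQueue<>(Comparator.comparingDouble((Entry e) -> e.dist));
        queue.add(new Entry(0.5, 1, 1)); // stale: cluster 1 has since moved to version 2
        queue.add(new Entry(0.9, 2, 1)); // still valid
        queue.add(new Entry(0.7, 1, 2)); // valid replacement for cluster 1

        // Poll until a non-stale entry turns up, discarding outdated ones on the way.
        Entry best;
        do {
            best = queue.poll();
        } while (best != null && currentVersion.get(best.id) != best.version);
        System.out.println(best == null ? "queue exhausted"
                : "best valid entry: cluster " + best.id + " at " + best.dist);
    }
}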
From source file:org.broad.igv.track.PackedFeatures.java
/** * Allocates each feature to the rows such that there is no overlap. * * @param iter TabixLineReader wrapping the collection of alignments. Note that this should * really be an Iterator<T>, but it can't be subclassed if that's the case. *///from ww w . j a v a2 s. c o m List<FeatureRow> packFeatures(Iterator iter) { List<FeatureRow> rows = new ArrayList(10); if (iter == null || !iter.hasNext()) { return rows; } maxFeatureLength = 0; int totalCount = 0; LinkedHashMap<Integer, PriorityQueue<T>> bucketArray = new LinkedHashMap(); Comparator pqComparator = new Comparator<T>() { public int compare(Feature row1, Feature row2) { return (row2.getEnd() - row2.getStart()) - (row1.getEnd() - row2.getStart()); } }; // Allocate features to buckets, 1 bucket per base position while (iter.hasNext()) { T feature = (T) iter.next(); maxFeatureLength = Math.max(maxFeatureLength, getFeatureEndForPacking(feature) - getFeatureStartForPacking(feature)); features.add(feature); int bucketNumber = getFeatureStartForPacking(feature); PriorityQueue<T> bucket = bucketArray.get(bucketNumber); if (bucket == null) { bucket = new PriorityQueue<T>(5, pqComparator); bucketArray.put(bucketNumber, bucket); } bucket.add(feature); totalCount++; } // Allocate features to rows, pulling at most 1 per bucket for each row FeatureRow currentRow = new FeatureRow(); int allocatedCount = 0; int nextStart = Integer.MIN_VALUE; int lastKey = 0; int lastAllocatedCount = -1; while (allocatedCount < totalCount && rows.size() < maxLevels) { // Check to prevent infinite loops if (lastAllocatedCount == allocatedCount) { if (IGV.hasInstance()) { String msg = "Infinite loop detected while packing features for track: " + getTrackName() + ".<br>Not all features will be shown." + "<br>Please contact igv-team@broadinstitute.org"; log.error(msg); MessageUtils.showMessage(msg); } break; } lastAllocatedCount = allocatedCount; // Next row Loop through alignments until we reach the end of the interval PriorityQueue<T> bucket = null; // Advance to nextLine occupied bucket ArrayList<Integer> emptyBucketKeys = new ArrayList(); for (Integer key : bucketArray.keySet()) { //if (key < lastKey) { // String msg = "Features from track: " + trackName + " are not sorted. Some features might not be shown.<br>" + // "Please notify igv-help@broadinstitute.org"; // MessageUtils.showMessage(msg); //} lastKey = key; if (key >= nextStart) { bucket = bucketArray.get(key); T feature = bucket.poll(); if (bucket.isEmpty()) { emptyBucketKeys.add(key); } currentRow.addFeature(feature); nextStart = currentRow.end + FeatureTrack.MINIMUM_FEATURE_SPACING; allocatedCount++; } } for (Integer key : emptyBucketKeys) { bucketArray.remove(key); } // We've reached the end of the interval, start a new row if (currentRow.features.size() > 0) { rows.add(currentRow); lastAllocatedCount = 0; } currentRow = new FeatureRow(); nextStart = 0; lastKey = 0; } // Add the last row if (currentRow.features.size() > 0) { rows.add(currentRow); } return rows; }
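Here the features are bucketed by start position, with one PriorityQueue per bucket ordered so that poll() returns the longest feature available when a row is being filled. A hedged sketch of that per-bucket, longest-first selection using plain {start, end} int arrays instead of the Feature type:

import java.util.*;

public class LongestFirstBuckets {
    public static void main(String[] args) {
        // One queue per start position; within a bucket, longer intervals are polled first.
        Comparator<int[]> longestFirst = Comparator.comparingInt((int[] f) -> f[1] - f[0]).reversed();
        Map<Integer, PriorityQueue<int[]>> buckets = new LinkedHashMap<>();
        for (int[] feature : new int[][] { { 10, 12 }, { 10, 30 }, { 40, 45 } }) {
            buckets.computeIfAbsent(feature[0], k -> new PriorityQueue<>(longestFirst)).add(feature);
        }
        // Fill one "row": take at most one feature per bucket, longest candidate first.
        int nextStart = Integer.MIN_VALUE;
        for (Map.Entry<Integer, PriorityQueue<int[]>> bucket : buckets.entrySet()) {
            if (bucket.getKey() >= nextStart) {
                int[] feature = bucket.getValue().poll();
                System.out.println("row gets [" + feature[0] + ", " + feature[1] + ")");
                nextStart = feature[1] + 1; // minimum spacing of 1 between packed features
            }
        }
    }
}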
From source file:com.joliciel.talismane.tokeniser.patterns.CompoundPatternTokeniser.java
@Override public List<TokenisedAtomicTokenSequence> tokeniseWithDecisions(Sentence sentence) { MONITOR.startTask("tokeniseWithDecisions"); try {/*from w w w. j av a 2 s . c o m*/ // apply any pre-tokenisation decisions via filters // we only want one placeholder per start index - the first one that gets added Map<Integer, TokenPlaceholder> placeholderMap = new HashMap<Integer, TokenPlaceholder>(); for (TokenFilter tokenFilter : this.tokenFilters) { Set<TokenPlaceholder> myPlaceholders = tokenFilter.apply(sentence.getText()); for (TokenPlaceholder placeholder : myPlaceholders) { if (!placeholderMap.containsKey(placeholder.getStartIndex())) { placeholderMap.put(placeholder.getStartIndex(), placeholder); } } if (LOG.isTraceEnabled()) { if (myPlaceholders.size() > 0) { LOG.trace("TokenFilter: " + tokenFilter); LOG.trace("placeholders: " + myPlaceholders); } } } Set<TokenPlaceholder> placeholders = new HashSet<TokenPlaceholder>(placeholderMap.values()); // Initially, separate the sentence into tokens using the separators provided TokenSequence tokenSequence = this.tokeniserService.getTokenSequence(sentence, Tokeniser.SEPARATORS, placeholders); // apply any pre-processing filters that have been added for (TokenSequenceFilter tokenSequenceFilter : this.tokenSequenceFilters) { tokenSequenceFilter.apply(tokenSequence); } // Assign each separator its default value List<TokeniserOutcome> defaultOutcomes = this.tokeniserPatternManager.getDefaultOutcomes(tokenSequence); List<Decision<TokeniserOutcome>> defaultDecisions = new ArrayList<Decision<TokeniserOutcome>>( defaultOutcomes.size()); for (TokeniserOutcome outcome : defaultOutcomes) { Decision<TokeniserOutcome> tokeniserDecision = this.tokeniserDecisionFactory .createDefaultDecision(outcome); tokeniserDecision.addAuthority("_" + this.getClass().getSimpleName()); tokeniserDecision.addAuthority("_" + "DefaultDecision"); defaultDecisions.add(tokeniserDecision); } List<TokenisedAtomicTokenSequence> sequences = null; // For each test pattern, see if anything in the sentence matches it if (this.decisionMaker != null) { List<TokenPatternMatchSequence> matchingSequences = new ArrayList<TokenPatternMatchSequence>(); Map<Token, Set<TokenPatternMatchSequence>> tokenMatchSequenceMap = new HashMap<Token, Set<TokenPatternMatchSequence>>(); Map<TokenPatternMatchSequence, TokenPatternMatch> primaryMatchMap = new HashMap<TokenPatternMatchSequence, TokenPatternMatch>(); Set<Token> matchedTokens = new HashSet<Token>(); MONITOR.startTask("pattern matching"); try { for (TokenPattern parsedPattern : this.getTokeniserPatternManager().getParsedTestPatterns()) { List<TokenPatternMatchSequence> matchesForThisPattern = parsedPattern.match(tokenSequence); for (TokenPatternMatchSequence matchSequence : matchesForThisPattern) { matchingSequences.add(matchSequence); matchedTokens.addAll(matchSequence.getTokensToCheck()); TokenPatternMatch primaryMatch = null; Token token = matchSequence.getTokensToCheck().get(0); Set<TokenPatternMatchSequence> matchSequences = tokenMatchSequenceMap.get(token); if (matchSequences == null) { matchSequences = new TreeSet<TokenPatternMatchSequence>(); tokenMatchSequenceMap.put(token, matchSequences); } matchSequences.add(matchSequence); for (TokenPatternMatch patternMatch : matchSequence.getTokenPatternMatches()) { if (patternMatch.getToken().equals(token)) { primaryMatch = patternMatch; break; } } if (LOG.isTraceEnabled()) { LOG.trace("Found match: " + primaryMatch); } primaryMatchMap.put(matchSequence, primaryMatch); } } } finally { 
MONITOR.endTask("pattern matching"); } // we want to create the n most likely token sequences // the sequence has to correspond to a token pattern Map<TokenPatternMatchSequence, List<Decision<TokeniserOutcome>>> matchSequenceDecisionMap = new HashMap<TokenPatternMatchSequence, List<Decision<TokeniserOutcome>>>(); for (TokenPatternMatchSequence matchSequence : matchingSequences) { TokenPatternMatch match = primaryMatchMap.get(matchSequence); LOG.debug("next pattern match: " + match.toString()); List<FeatureResult<?>> tokenFeatureResults = new ArrayList<FeatureResult<?>>(); MONITOR.startTask("analyse features"); try { for (TokenPatternMatchFeature<?> feature : features) { RuntimeEnvironment env = this.featureService.getRuntimeEnvironment(); FeatureResult<?> featureResult = feature.check(match, env); if (featureResult != null) { tokenFeatureResults.add(featureResult); } } if (LOG.isTraceEnabled()) { for (FeatureResult<?> featureResult : tokenFeatureResults) { LOG.trace(featureResult.toString()); } } } finally { MONITOR.endTask("analyse features"); } List<Decision<TokeniserOutcome>> decisions = null; MONITOR.startTask("make decision"); try { decisions = this.decisionMaker.decide(tokenFeatureResults); for (ClassificationObserver<TokeniserOutcome> observer : this.observers) observer.onAnalyse(match.getToken(), tokenFeatureResults, decisions); for (Decision<TokeniserOutcome> decision : decisions) { decision.addAuthority("_" + this.getClass().getSimpleName()); decision.addAuthority("_" + "Patterns"); decision.addAuthority(match.getPattern().getName()); } } finally { MONITOR.endTask("make decision"); } matchSequenceDecisionMap.put(matchSequence, decisions); } // initially create a heap with a single, empty sequence PriorityQueue<TokenisedAtomicTokenSequence> heap = new PriorityQueue<TokenisedAtomicTokenSequence>(); TokenisedAtomicTokenSequence emptySequence = this.getTokeniserService() .getTokenisedAtomicTokenSequence(sentence, 0); heap.add(emptySequence); for (int i = 0; i < tokenSequence.listWithWhiteSpace().size(); i++) { Token token = tokenSequence.listWithWhiteSpace().get(i); if (LOG.isTraceEnabled()) { LOG.trace("Token : \"" + token.getText() + "\""); } // build a new heap for this iteration PriorityQueue<TokenisedAtomicTokenSequence> previousHeap = heap; heap = new PriorityQueue<TokenisedAtomicTokenSequence>(); if (i == 0) { // first token is always "separate" from the outside world Decision<TokeniserOutcome> decision = this.tokeniserDecisionFactory .createDefaultDecision(TokeniserOutcome.SEPARATE); decision.addAuthority("_" + this.getClass().getSimpleName()); decision.addAuthority("_" + "DefaultDecision"); TaggedToken<TokeniserOutcome> taggedToken = this.tokeniserService.getTaggedToken(token, decision); TokenisedAtomicTokenSequence newSequence = this.getTokeniserService() .getTokenisedAtomicTokenSequence(emptySequence); newSequence.add(taggedToken); heap.add(newSequence); continue; } // limit the heap breadth to K int maxSequences = previousHeap.size() > this.getBeamWidth() ? 
this.getBeamWidth() : previousHeap.size(); MONITOR.startTask("heap sort"); try { for (int j = 0; j < maxSequences; j++) { TokenisedAtomicTokenSequence history = previousHeap.poll(); // Find the separating & non-separating decisions if (history.size() > i) { // token already added as part of a sequence introduced by another token heap.add(history); } else if (tokenMatchSequenceMap.containsKey(token)) { // token begins one or more match sequences // these are ordered from shortest to longest (via TreeSet) List<TokenPatternMatchSequence> matchSequences = new ArrayList<TokenPatternMatchSequence>( tokenMatchSequenceMap.get(token)); // Since sequences P1..Pn contain each other, // there can be exactly matchSequences.size() consistent solutions // Assume the default is separate // 0: all separate // 1: join P1, separate rest // 2: join P2, separate rest // ... // n: join Pn // We need to add each of these to the heap // by taking the product of all probabilities consistent with each solution // The probabities for each solution are (j=join, s=separate) // All separate: s1 x s2 x ... x sn // P1: j1 x s2 x ... x sn // P2: j1 x j2 x ... x sn // ... // Pn: j1 x j2 x ... x jn // Any solution of the form s1 x j2 would be inconsistent, and is not considered // If Pi and Pj start and end on the exact same token, then the solution for both is // Pi: j1 x ... x ji x jj x sj+1 ... x sn // Pj: j1 x ... x ji x jj x sj+1 ... x sn // Note of course that we're never likely to have more than two Ps here, // but we need a solution for more just to be sure to be sure TokeniserOutcome defaultOutcome = defaultDecisions .get(token.getIndexWithWhiteSpace()).getOutcome(); TokeniserOutcome otherOutcome = null; if (defaultOutcome == TokeniserOutcome.SEPARATE) otherOutcome = TokeniserOutcome.JOIN; else otherOutcome = TokeniserOutcome.SEPARATE; double[] decisionProbs = new double[matchSequences.size() + 1]; for (int k = 0; k < decisionProbs.length; k++) decisionProbs[k] = 1; // Note: k0 = default decision (e.g. separate all), k1=first pattern // p1 = first pattern int p = 1; int prevEndIndex = -1; for (TokenPatternMatchSequence matchSequence : matchSequences) { int endIndex = matchSequence.getTokensToCheck() .get(matchSequence.getTokensToCheck().size() - 1).getEndIndex(); List<Decision<TokeniserOutcome>> decisions = matchSequenceDecisionMap .get(matchSequence); for (Decision<TokeniserOutcome> decision : decisions) { for (int k = 0; k < decisionProbs.length; k++) { if (decision.getOutcome() == defaultOutcome) { // e.g. separate in most cases if (k < p && endIndex > prevEndIndex) decisionProbs[k] *= decision.getProbability(); else if (k + 1 < p && endIndex <= prevEndIndex) decisionProbs[k] *= decision.getProbability(); } else { // e.g. 
join in most cases if (k >= p && endIndex > prevEndIndex) decisionProbs[k] *= decision.getProbability(); else if (k + 1 >= p && endIndex <= prevEndIndex) decisionProbs[k] *= decision.getProbability(); } } // next k } // next decision (only 2 of these) prevEndIndex = endIndex; p++; } // transform to probability distribution double sumProbs = 0; for (int k = 0; k < decisionProbs.length; k++) sumProbs += decisionProbs[k]; if (sumProbs > 0) for (int k = 0; k < decisionProbs.length; k++) decisionProbs[k] /= sumProbs; // Apply default decision // Since this is the default decision for all tokens in the sequence, we don't add the other tokens for now, // so as to allow them // to get examined one at a time, just in case one of them starts its own separate sequence Decision<TokeniserOutcome> defaultDecision = this.tokeniserDecisionFactory .createDecision(defaultOutcome.getCode(), decisionProbs[0]); defaultDecision.addAuthority("_" + this.getClass().getSimpleName()); defaultDecision.addAuthority("_" + "Patterns"); for (TokenPatternMatchSequence matchSequence : matchSequences) { defaultDecision.addAuthority(matchSequence.getTokenPattern().getName()); } TaggedToken<TokeniserOutcome> defaultTaggedToken = this.tokeniserService .getTaggedToken(token, defaultDecision); TokenisedAtomicTokenSequence defaultSequence = this.getTokeniserService() .getTokenisedAtomicTokenSequence(history); defaultSequence.add(defaultTaggedToken); defaultSequence.addDecision(defaultDecision); heap.add(defaultSequence); // Apply one non-default decision per match sequence for (int k = 0; k < matchSequences.size(); k++) { TokenPatternMatchSequence matchSequence = matchSequences.get(k); double prob = decisionProbs[k + 1]; Decision<TokeniserOutcome> decision = this.tokeniserDecisionFactory .createDecision(otherOutcome.getCode(), prob); decision.addAuthority("_" + this.getClass().getSimpleName()); decision.addAuthority("_" + "Patterns"); decision.addAuthority(matchSequence.getTokenPattern().getName()); TaggedToken<TokeniserOutcome> taggedToken = this.tokeniserService .getTaggedToken(token, decision); TokenisedAtomicTokenSequence newSequence = this.getTokeniserService() .getTokenisedAtomicTokenSequence(history); newSequence.add(taggedToken); newSequence.addDecision(decision); // The decision is NOT the default decision for all tokens in the sequence, add all other tokens // in this sequence to the solution for (Token tokenInSequence : matchSequence.getTokensToCheck()) { if (tokenInSequence.equals(token)) { continue; } Decision<TokeniserOutcome> decisionInSequence = this.tokeniserDecisionFactory .createDefaultDecision(decision.getOutcome()); decisionInSequence.addAuthority("_" + this.getClass().getSimpleName()); decisionInSequence.addAuthority("_" + "DecisionInSequence"); decisionInSequence.addAuthority("_" + "DecisionInSequence_non_default"); decisionInSequence.addAuthority("_" + "Patterns"); TaggedToken<TokeniserOutcome> taggedTokenInSequence = this.tokeniserService .getTaggedToken(tokenInSequence, decisionInSequence); newSequence.add(taggedTokenInSequence); } heap.add(newSequence); } // next sequence } else { // token doesn't start match sequence, and hasn't already been added to the current sequence Decision<TokeniserOutcome> decision = defaultDecisions.get(i); if (matchedTokens.contains(token)) { decision = this.tokeniserDecisionFactory .createDefaultDecision(decision.getOutcome()); decision.addAuthority("_" + this.getClass().getSimpleName()); decision.addAuthority("_" + "DecisionInSequence"); decision.addAuthority("_" + 
"DecisionInSequence_default"); decision.addAuthority("_" + "Patterns"); } TaggedToken<TokeniserOutcome> taggedToken = this.tokeniserService .getTaggedToken(token, decision); TokenisedAtomicTokenSequence newSequence = this.getTokeniserService() .getTokenisedAtomicTokenSequence(history); newSequence.add(taggedToken); heap.add(newSequence); } } // next sequence in the old heap } finally { MONITOR.endTask("heap sort"); } } // next token sequences = new ArrayList<TokenisedAtomicTokenSequence>(); int k = 0; while (!heap.isEmpty()) { sequences.add(heap.poll()); k++; if (k >= this.getBeamWidth()) break; } } else { sequences = new ArrayList<TokenisedAtomicTokenSequence>(); TokenisedAtomicTokenSequence defaultSequence = this.getTokeniserService() .getTokenisedAtomicTokenSequence(sentence, 0); int i = 0; for (Token token : tokenSequence.listWithWhiteSpace()) { TaggedToken<TokeniserOutcome> taggedToken = this.tokeniserService.getTaggedToken(token, defaultDecisions.get(i++)); defaultSequence.add(taggedToken); } sequences.add(defaultSequence); } // have decision maker? LOG.debug("####Final token sequences:"); int j = 1; for (TokenisedAtomicTokenSequence sequence : sequences) { TokenSequence newTokenSequence = sequence.inferTokenSequence(); if (LOG.isDebugEnabled()) { LOG.debug("Token sequence " + (j++) + ", score=" + df.format(sequence.getScore())); LOG.debug("Atomic sequence: " + sequence); LOG.debug("Resulting sequence: " + newTokenSequence); } // need to re-apply the pre-processing filters, because the tokens are all new // Question: why can't we conserve the initial tokens when they haven't changed at all? // Answer: because the tokenSequence and index in the sequence is referenced by the token. // Question: should we create a separate class, Token and TokenInSequence, // one with index & sequence access & one without? for (TokenSequenceFilter tokenSequenceFilter : this.tokenSequenceFilters) { tokenSequenceFilter.apply(newTokenSequence); } if (LOG.isDebugEnabled()) { LOG.debug("After filters: " + newTokenSequence); } } return sequences; } finally { MONITOR.endTask("tokeniseWithDecisions"); } }