Java tutorial
/* $This file is distributed under the terms of the license in /doc/license.txt$ */ package edu.cornell.mannlib.ld4lindexing; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.NoSuchElementException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import sun.misc.Signal; import sun.misc.SignalHandler; import edu.cornell.mannlib.ld4lindexing.documents.DocumentFactory.Type; import edu.cornell.mannlib.ld4lindexing.triplestores.TripleStore; /** * Produces a stream of URIs, along with the type of the document required. * * You provide a list of query specs, which describe a document type and the * query used to obtain them. * * TODO: should be interruptible by the SIGINT. It should just behave as if it * found no more URIs. (after updating the bookmark.) */ public class UriDiscoverer implements Iterable<DocumentKey>, SignalHandler { private static final Log log = LogFactory.getLog(UriDiscoverer.class); private final TripleStore ts; private final Bookmark bookmark; private final Report report; private int uriLimit = Integer.MAX_VALUE; private int batchSize = 1000; private List<QuerySpec> querySpecs = new ArrayList<>(); private volatile boolean terminated; public UriDiscoverer(TripleStore ts, Bookmark bookmark, Report report) { this.ts = ts; this.bookmark = bookmark; this.report = report; } public void addQuerySpec(Type type, String query) { querySpecs.add(new QuerySpec(type, query)); } public void setUriLimit(int uriLimit) { this.uriLimit = uriLimit; } /** * @return */ public Iterable<DocumentKey> discover() { return this; } @Override public Iterator<DocumentKey> iterator() { return new InnerIterator(querySpecs, bookmark); } private static class QuerySpec { private final Type type; private final String query; public QuerySpec(Type type, String query) { this.type = type; this.query = query; } public Type getType() { return type; } public String getQuery() { return query; } } private class InnerIterator implements Iterator<DocumentKey> { private final List<QuerySpec> specs; private final Bookmark bookmark; private final List<String> uriBuffer = new ArrayList<>(); private DocumentKey nextKey; public InnerIterator(List<QuerySpec> specs, Bookmark bookmark) { this.specs = new ArrayList<>(specs); this.bookmark = bookmark; getNextKey(); } @Override public boolean hasNext() { return (bookmark.getOffset() < uriLimit) && (nextKey != null); } @Override public DocumentKey next() { if (hasNext()) { DocumentKey result = nextKey; bookmark.increment(); getNextKey(); return result; } else { throw new NoSuchElementException(); } } @Override public void remove() { throw new UnsupportedOperationException(); } private void getNextKey() { if (terminated) { handleTermination(); } if (uriBuffer.isEmpty()) { replenishTheBuffer(); } if (uriBuffer.isEmpty()) { advanceToNextQuerySpec(); } if (uriBuffer.isEmpty()) { nextKey = null; } else { nextKey = new DocumentKey(specs.get(0).getType(), uriBuffer.remove(0)); } } private void handleTermination() { throw new TerminatedException(); } private void advanceToNextQuerySpec() { if (bookmark.getTypeIndex() < specs.size() - 1) { bookmark.nextType(); replenishTheBuffer(); } } private void replenishTheBuffer() { bookmark.persist(); QuerySpec currentSpec = specs.get(bookmark.getTypeIndex()); String query = String.format("%s OFFSET %d LIMIT %d", currentSpec.getQuery(), bookmark.getOffset(), batchSize); findUris(query); report.progress(currentSpec.getType(), bookmark.getOffset(), uriBuffer.size()); } private void findUris(String query) { QueryRunner qr = new QueryRunner(ts, query); for (Map<String, Object> row : qr.execute()) { log.debug("RESULTS: " + row); uriBuffer.add((String) row.get("uri")); } } } /** * What to do when a Control-C comes in? */ @Override public void handle(Signal sig) { terminated = true; } }