edu.cornell.mannlib.ld4lindexing.UriDiscoverer.java Source code

Java tutorial

Introduction

Here is the source code for edu.cornell.mannlib.ld4lindexing.UriDiscoverer.java

Source

/* $This file is distributed under the terms of the license in /doc/license.txt$ */

package edu.cornell.mannlib.ld4lindexing;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import sun.misc.Signal;
import sun.misc.SignalHandler;
import edu.cornell.mannlib.ld4lindexing.documents.DocumentFactory.Type;
import edu.cornell.mannlib.ld4lindexing.triplestores.TripleStore;

/**
 * Produces a stream of URIs, along with the type of the document required.
 * 
 * You provide a list of query specs, each of which pairs a document type with
 * the query used to find the URIs for that type. The specs are processed in
 * the order in which they were added. Each query is run in batches of
 * {@code batchSize} rows by appending an OFFSET/LIMIT clause, and the
 * {@link Bookmark} is persisted before each batch is fetched.
 * 
 * TODO: should be interruptible by the SIGINT. It should just behave as if it
 * found no more URIs. (after updating the bookmark.)
 */
public class UriDiscoverer implements Iterable<DocumentKey>, SignalHandler {
    private static final Log log = LogFactory.getLog(UriDiscoverer.class);

    private final TripleStore ts;
    private final Bookmark bookmark;
    private final Report report;

    /** Iteration stops once the bookmark offset reaches this value. */
    private int uriLimit = Integer.MAX_VALUE;

    /** Number of rows requested from the triple store per query. */
    private int batchSize = 1000;

    private final List<QuerySpec> querySpecs = new ArrayList<>();

    /** Set by {@link #handle(Signal)}; checked each time a key is fetched. */
    private volatile boolean terminated;

    /**
     * @param ts
     *            the triple store that the discovery queries are run against
     * @param bookmark
     *            tracks the current query spec and the offset within it
     * @param report
     *            receives a progress notice after each batch is fetched
     */
    public UriDiscoverer(TripleStore ts, Bookmark bookmark, Report report) {
        this.ts = ts;
        this.bookmark = bookmark;
        this.report = report;
    }

    /**
     * Adds a query to the end of the list of queries to be run.
     * 
     * @param type
     *            the document type assigned to every URI found by the query
     * @param query
     *            the query text; "OFFSET n LIMIT m" is appended to it, and
     *            the URI is read from the "uri" field of each result row
     */
    public void addQuerySpec(Type type, String query) {
        querySpecs.add(new QuerySpec(type, query));
    }

    /**
     * Limits the iteration: no more keys are returned once the bookmark
     * offset reaches this value.
     */
    public void setUriLimit(int uriLimit) {
        this.uriLimit = uriLimit;
    }

    /**
     * @return this discoverer, as an {@link Iterable} over the discovered
     *         document keys
     */
    public Iterable<DocumentKey> discover() {
        return this;
    }

    /**
     * Creates an iterator over a snapshot of the current query specs. The
     * first batch of URIs is fetched as the iterator is created.
     */
    @Override
    public Iterator<DocumentKey> iterator() {
        return new InnerIterator(querySpecs, bookmark);
    }

    /** A document type, together with the query that finds URIs for it. */
    private static class QuerySpec {
        private final Type type;
        private final String query;

        public QuerySpec(Type type, String query) {
            this.type = type;
            this.query = query;
        }

        public Type getType() {
            return type;
        }

        public String getQuery() {
            return query;
        }

    }

    /**
     * Walks through the query specs, starting at the position recorded in the
     * bookmark, fetching URIs one batch at a time and handing them out one by
     * one. Always holds the next key in hand, so hasNext() is a simple test.
     */
    private class InnerIterator implements Iterator<DocumentKey> {

        private final List<QuerySpec> specs;
        private final Bookmark bookmark;
        private final List<String> uriBuffer = new ArrayList<>();
        private DocumentKey nextKey;

        public InnerIterator(List<QuerySpec> specs, Bookmark bookmark) {
            // Copy the list, so specs added later don't affect this iterator.
            this.specs = new ArrayList<>(specs);
            this.bookmark = bookmark;
            getNextKey();
        }

        @Override
        public boolean hasNext() {
            return (bookmark.getOffset() < uriLimit) && (nextKey != null);
        }

        /**
         * @throws NoSuchElementException
         *             if there are no more keys, or the URI limit was reached
         * @throws TerminatedException
         *             if a signal was received before the following key could
         *             be fetched
         */
        @Override
        public DocumentKey next() {
            if (!hasNext()) {
                throw new NoSuchElementException();
            }
            DocumentKey result = nextKey;
            bookmark.increment();
            getNextKey();
            return result;
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }

        /**
         * Loads the next key, or null if every query spec is exhausted.
         */
        private void getNextKey() {
            if (terminated) {
                handleTermination();
            }
            if (uriBuffer.isEmpty()) {
                replenishTheBuffer();
            }
            if (uriBuffer.isEmpty()) {
                advanceToNextQuerySpec();
            }
            if (uriBuffer.isEmpty()) {
                nextKey = null;
            } else {
                // The buffer only ever holds URIs from the spec that the
                // bookmark currently points to, so that spec supplies the
                // type.
                QuerySpec currentSpec = specs.get(bookmark.getTypeIndex());
                nextKey = new DocumentKey(currentSpec.getType(), uriBuffer.remove(0));
            }
        }

        private void handleTermination() {
            throw new TerminatedException();
        }

        /**
         * The current spec has no more URIs. Move through the remaining specs
         * until one of them yields some URIs, or until there are no specs
         * left. A spec that yields nothing must not end the iteration while
         * later specs remain.
         */
        private void advanceToNextQuerySpec() {
            while (uriBuffer.isEmpty() && bookmark.getTypeIndex() < specs.size() - 1) {
                bookmark.nextType();
                replenishTheBuffer();
            }
        }

        /**
         * Persists the bookmark, then fetches the next batch of URIs for the
         * current spec and reports the progress. If the bookmark does not
         * point to one of the specs (for example, no specs were added), the
         * buffer is left empty.
         */
        private void replenishTheBuffer() {
            bookmark.persist();

            int typeIndex = bookmark.getTypeIndex();
            if (typeIndex < 0 || typeIndex >= specs.size()) {
                log.warn("Bookmark type index " + typeIndex + " does not match any of the " + specs.size()
                        + " query specs; no URIs will be fetched.");
                return;
            }

            QuerySpec currentSpec = specs.get(typeIndex);
            String query = String.format("%s OFFSET %d LIMIT %d", currentSpec.getQuery(), bookmark.getOffset(),
                    batchSize);
            findUris(query);
            report.progress(currentSpec.getType(), bookmark.getOffset(), uriBuffer.size());
        }

        /**
         * Runs the query and adds the "uri" value of each result row to the
         * buffer.
         */
        private void findUris(String query) {
            QueryRunner qr = new QueryRunner(ts, query);
            for (Map<String, Object> row : qr.execute()) {
                log.debug("RESULTS: " + row);
                uriBuffer.add((String) row.get("uri"));
            }
        }
    }

    /**
     * What to do when a Control-C comes in? Just set the flag; the iterator
     * throws a TerminatedException the next time it tries to fetch a key.
     */
    @Override
    public void handle(Signal sig) {
        terminated = true;
    }

}