edu.psu.citeseerx.updates.IndexUpdateManager.java Source code

Java tutorial

Introduction

Here is the source code for edu.psu.citeseerx.updates.IndexUpdateManager.java

Source

/*
 * Copyright 2007 Penn State University
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *     http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.psu.citeseerx.updates;

import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.ProtocolException;
import java.net.URL;
import java.sql.SQLException;

import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.StringTokenizer;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Executors;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;

import com.google.common.base.CharMatcher;

import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.ConcurrentUpdateSolrServer;
import org.apache.solr.common.SolrInputDocument;

import edu.psu.citeseerx.dao2.logic.CSXDAO;
import edu.psu.citeseerx.dao2.logic.CiteClusterDAO;
import edu.psu.citeseerx.domain.Author;
import edu.psu.citeseerx.domain.Document;
import edu.psu.citeseerx.domain.DocumentFileInfo;
import edu.psu.citeseerx.domain.DomainTransformer;
import edu.psu.citeseerx.domain.RepositoryService;
import edu.psu.citeseerx.domain.Keyword;
import edu.psu.citeseerx.domain.Tag;
import edu.psu.citeseerx.domain.ThinDoc;
import edu.psu.citeseerx.repository.DocumentUnavailableException;
import edu.psu.citeseerx.repository.RepositoryUtilities;
import edu.psu.citeseerx.utility.SafeText;

/**
 * Utilities for updating a Solr index to be consistent with the csx_citegraph
 * cluster table within the storage backend.  This class reads in cluster
 * records and, if the cluster is marked to have a corresponding document
 * record within the citeseerx papers table, will read in the paper record
 * as well to create XML in the Solr update format and send the XML to
 * a Solr server.
 * <br><br>
 * The IndexUpdateManager maintains a timestamp of the last update within
 * the csx_citegraph database so that only records modified since the last
 * update will be processed.
 * <br><br>
 * Deletions are handled by reading in records marked for deletion within
 * the csx_citegraph deletions table.
 *
 * @author Isaac Councill
 * @version $Rev$ $Date$
 */
public class IndexUpdateManager {

    /** Commons-logging logger keyed on the runtime class of this instance. */
    protected final Log logger = LogFactory.getLog(getClass());
    /** When true, indexAll() starts from the epoch instead of the stored last index time. */
    private boolean redoAll;
    /** Solr client used for adds, deletes, commits and optimize; created in setSolrURL. */
    private SolrServer solrServer;
    /** Cluster id of the most recently submitted document; passed back to the DAO as the paging cursor. */
    private long lastIndexedCluster;
    /** Page size for cluster queries; also handed to the ConcurrentUpdateSolrServer constructor. */
    private final int indexBatchSize = 1000;

    /** Parsed form of the Solr update URL; assigned in setSolrURL. */
    private URL solrUpdateUrl;

    /**
     * Points this manager at a Solr update endpoint and builds the client
     * used for all subsequent index operations.
     * @param solrUpdateUrl URL of the Solr update handler
     * @throws MalformedURLException if the supplied string is not a valid URL
     */
    public void setSolrURL(String solrUpdateUrl) throws MalformedURLException {
        // Parse first: a malformed URL must be rejected before a client is created.
        this.solrUpdateUrl = new URL(solrUpdateUrl);

        // One client-side sender thread per available processor.
        final int senderThreads = Runtime.getRuntime().availableProcessors();
        this.solrServer = new ConcurrentUpdateSolrServer(solrUpdateUrl, indexBatchSize, senderThreads);
    } //- setSolrURL

    // Data access object for paper records; used to load full documents by id.
    private CSXDAO csxdao;

    // Repository handle passed to RepositoryUtilities when fetching document text.
    private RepositoryService repositoryService;

    /**
     * Sets the DAO used to load full document records.
     * @param csxdao document data access object
     */
    public void setCSXDAO(CSXDAO csxdao) {
        this.csxdao = csxdao;
    } //- setCSXDAO

    // Data access object for cluster records: cluster pages, citation links,
    // paper ids, deletions and the last index time are all read through it.
    private CiteClusterDAO citedao;

    /**
     * @return the repository service currently configured on this manager
     */
    public RepositoryService getRepositoryService() {
        return repositoryService;
    }

    /**
     * Sets the repository service used when fetching document text.
     * @param repositoryService repository service to use
     */
    public void setRepositoryService(RepositoryService repositoryService) {
        this.repositoryService = repositoryService;
    }

    /**
     * Sets the DAO used to read cluster records, citation links and deletions.
     * @param citedao cluster data access object
     */
    public void setCiteClusterDAO(CiteClusterDAO citedao) {
        this.citedao = citedao;
    } //- setCiteClusterDAO

    /**
     * Chooses between a full re-index and an incremental one in indexAll().
     * @param redoAll true to ignore the stored last index time and start
     * from the epoch
     */
    public void setredoAll(boolean redoAll) {
        this.redoAll = redoAll;
    }

    // Worker pool that runs the per-cluster indexing tasks. It is created once
    // per instance (instance initializer below) and is shut down at the end of
    // indexInCollection() and indexAll().
    private ExecutorService threadPool;
    {
        int cpus = Runtime.getRuntime().availableProcessors();

        // Two workers per available processor.
        threadPool = Executors.newFixedThreadPool(cpus * 2);
    }

    /**
     * Updates the index only for records that have corresponding document
     * records (document files within the CiteSeerX corpus).  This re-indexes
     * everything - no index update time is recorded.
     * <br><br>
     * Clusters are read in pages of indexBatchSize, resuming after the last
     * cluster id submitted; each page is committed before the next is read.
     * The worker pool is shut down when the pass ends (normally or with an
     * exception), after which the index is optimized.
     * @throws IOException
     * @throws SolrServerException
     */
    public void indexInCollection() throws IOException, SolrServerException {
        int counter = 0;
        lastIndexedCluster = 0;

        try {
            while (true) {
                // indexClusters advances lastIndexedCluster, so every query
                // picks up where the previous page stopped.
                List<ThinDoc> docs = citedao.getClustersInCollection(
                        Long.valueOf(lastIndexedCluster), indexBatchSize);
                if (docs.isEmpty()) {
                    break;
                }

                counter += indexClusters(docs);
                solrServer.commit();
                System.out.println(counter + " documents added");
            }
        } finally {
            // Always release the workers; otherwise a failure mid-run would
            // leave the pool's non-daemon threads alive.
            threadPool.shutdown();
        }

        solrServer.optimize();
    } //- indexInCollection

    /**
     * Indexes all cluster records modified since the last update time.
     * <br><br>
     * When redoAll is set the epoch is used as the last update time, so
     * every cluster is considered.  Clusters are read in pages of
     * indexBatchSize and committed page by page; afterwards pending
     * deletions are applied and the time captured at the start of the run is
     * stored as the new last index time.  The worker pool is shut down when
     * the pass ends (normally or with an exception), after which the index is
     * optimized.
     * @throws SQLException
     * @throws IOException
     * @throws SolrServerException
     */
    public void indexAll() throws SQLException, IOException, SolrServerException {
        int counter = 0;
        // Captured before any work so that records modified while this run is
        // in progress fall after the stored timestamp.
        Date currentTime = new Date();
        Date lastUpdate;

        if (redoAll) {
            System.out.println("redo all document indexing...");
            lastUpdate = new Date(0L);
        } else {
            System.out.println("index new documents...");
            lastUpdate = citedao.getLastIndexTime();
        }

        lastIndexedCluster = 0;

        try {
            while (true) {
                System.out.println("lastIndexedCluster=" + lastIndexedCluster);

                // indexClusters advances lastIndexedCluster, so every query
                // picks up where the previous page stopped.
                List<ThinDoc> docs = citedao.getClustersSinceTime(
                        lastUpdate, Long.valueOf(lastIndexedCluster), indexBatchSize);
                if (docs.isEmpty()) {
                    break;
                }

                counter += indexClusters(docs);
                solrServer.commit();
                System.out.println(counter + " documents added");
            }

            System.out.println("deletion...");
            processDeletions(currentTime);

            citedao.setLastIndexTime(currentTime);
        } finally {
            // Always release the workers; otherwise a failure mid-run would
            // leave the pool's non-daemon threads alive.
            threadPool.shutdown();
        }

        System.out.println("optimize...");
        solrServer.optimize();
    } //- indexAll

    /**
     * Submits one indexing task per cluster to the worker pool and waits for
     * all of them to finish.
     * <br><br>
     * lastIndexedCluster is advanced to the cluster id of every submitted
     * document, so after the call it holds the id of the last element of
     * docs.  A task that fails is logged and skipped; the remaining tasks
     * are still awaited so that the caller's commit does not overlap with
     * running tasks.  If the calling thread is interrupted, its interrupt
     * status is restored and waiting stops.
     * @param docs page of cluster records to index
     * @return number of documents whose indexing task completed successfully
     * @throws IOException
     * @throws SolrServerException
     */
    private int indexClusters(List<ThinDoc> docs) throws IOException, SolrServerException {
        List<Future<Void>> futures = new ArrayList<Future<Void>>(docs.size());

        for (ThinDoc doc : docs) {
            Long clusterid = doc.getCluster();

            lastIndexedCluster = clusterid;
            futures.add(threadPool.submit(new TaskIndexCluster(doc, clusterid)));
        }

        int indexed = 0;
        for (int i = 0; i < futures.size(); i++) {
            try {
                futures.get(i).get();
                indexed++;
                System.out.print('.');
            } catch (ExecutionException e) {
                // The task itself failed: report it and keep waiting for the
                // others. This is not an interruption of the current thread.
                logger.error("Failed to index cluster " + docs.get(i).getCluster(), e.getCause());
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                break;
            }
        }

        System.out.println();

        return indexed;
    }

    /**
     * Unit of work executed on the worker pool: builds the Solr document for
     * one cluster and hands it to the Solr client.  Any exception thrown
     * while building or adding the document propagates out of call().
     */
    private class TaskIndexCluster implements Callable<Void> {
        private final ThinDoc doc;
        private final Long clusterid;

        public TaskIndexCluster(ThinDoc doc, Long clusterid) {
            this.doc = doc;
            this.clusterid = clusterid;
        }

        @Override
        public Void call() throws Exception {
            SolrInputDocument solrDoc = buildSolrInputDocumentOfCluster(doc, clusterid);
            solrServer.add(solrDoc);
            return null;
        }
    }

    /**
     * Builds the Solr document for a cluster.  If the cluster is flagged as
     * in-collection and a public full document can be found for it, the full
     * document is indexed (with the cluster id and citation count copied from
     * the cluster record); otherwise the cluster record itself is indexed as
     * a citation-only entry.
     * @param doc cluster record
     * @param clusterid id of the cluster
     * @return the Solr document to add to the index
     * @throws IOException
     */
    private SolrInputDocument buildSolrInputDocumentOfCluster(ThinDoc doc, Long clusterid) throws IOException {
        List<Long> cites = citedao.getCitedClusters(clusterid);
        List<Long> citedby = citedao.getCitingClusters(clusterid);

        // Only look for a full document when the cluster claims to have one.
        Document fullDoc = null;
        if (doc.getInCollection()) {
            fullDoc = findFullDocument(clusterid);
        }

        if (fullDoc == null) {
            // Either there is no full document or none of them is public:
            // index the citation record only.
            return buildSolrInputDocument(doc, cites, citedby);
        }

        // Index the full document
        fullDoc.setClusterID(clusterid);
        fullDoc.setNcites(doc.getNcites());
        return buildSolrInputDocument(fullDoc, cites, citedby);
    }

    /**
     * Looks up the papers attached to a cluster and returns the first one
     * that can be loaded and is public.
     * @param clusterid id of the cluster
     * @return the first public document of the cluster, or null if there is none
     */
    private Document findFullDocument(Long clusterid) {
        Iterator<String> paperIds = citedao.getPaperIDs(clusterid).iterator();

        while (paperIds.hasNext()) {
            // NOTE(review): the two false flags presumably switch off loading of
            // additional document parts; confirm against CSXDAO.
            Document candidate = csxdao.getDocumentFromDB(paperIds.next(), false, false);

            if (candidate == null || !candidate.isPublic()) {
                continue;
            }
            return candidate;
        }

        return null;
    }

    /**
     * Builds the Solr input document for the supplied document record.
     * <br><br>
     * Optional single-valued fields (doi, title, venue, abstract, text) are
     * only added when a value is present.  "incol" is "1" when the document
     * has a DOI and "0" otherwise.  The year is only added when it parses as
     * an integer.  Keywords and author names without a value are skipped.
     * The cited and citing cluster ids are stored as space-separated lists.
     * @param doc document to index; its cluster id must be set
     * @param cites ids of the clusters cited by this document
     * @param citedby ids of the clusters citing this document
     * @return the populated Solr input document
     * @throws IOException
     */
    private SolrInputDocument buildSolrInputDocument(Document doc, List<Long> cites, List<Long> citedby)
            throws IOException {
        String id = doc.getClusterID().toString();
        String doi = doc.getDatum(Document.DOI_KEY, Document.ENCODED);
        String title = doc.getDatum(Document.TITLE_KEY, Document.ENCODED);
        String venue = doc.getDatum(Document.VENUE_KEY, Document.ENCODED);
        String year = doc.getDatum(Document.YEAR_KEY, Document.ENCODED);
        String abs = doc.getDatum(Document.ABSTRACT_KEY, Document.ENCODED);
        String text = getText(doc);
        long vtime = (doc.getVersionTime() != null) ? doc.getVersionTime().getTime() : 0;

        List<String> authorNames = new ArrayList<String>();
        for (Author author : doc.getAuthors()) {
            String name = author.getDatum(Author.NAME_KEY, Author.ENCODED);
            if (name != null) {
                authorNames.add(name);
            }
        }

        SolrInputDocument solrDoc = new SolrInputDocument();

        solrDoc.addField("id", id);
        if (doi != null) {
            solrDoc.addField("doi", doi);
            solrDoc.addField("incol", "1");
        } else {
            solrDoc.addField("incol", "0");
        }

        if (title != null) {
            solrDoc.addField("title", title);
        }

        if (venue != null) {
            solrDoc.addField("venue", venue);
        }

        if (abs != null) {
            solrDoc.addField("abstract", abs);
        }

        solrDoc.addField("ncites", Integer.toString(doc.getNcites()));
        solrDoc.addField("scites", Integer.toString(doc.getSelfCites()));

        if (year != null) {
            try {
                // Round-trip through int so only a canonical number is indexed.
                solrDoc.addField("year", Integer.toString(Integer.parseInt(year)));
            } catch (NumberFormatException ignored) {
                // A non-numeric year is deliberately left out of the index.
            }
        }

        for (Keyword key : doc.getKeywords()) {
            String keyword = key.getDatum(Keyword.KEYWORD_KEY, Keyword.ENCODED);
            // Same rule as for author names: never add a null value.
            if (keyword != null) {
                solrDoc.addField("keyword", keyword);
            }
        }

        for (String name : authorNames) {
            solrDoc.addField("author", name);
        }

        for (String norm : buildAuthorNorms(authorNames)) {
            solrDoc.addField("authorNorms", norm);
        }

        if (text != null) {
            solrDoc.addField("text", text);
        }

        solrDoc.addField("cites", joinClusterIds(cites));
        solrDoc.addField("citedby", joinClusterIds(citedby));
        solrDoc.addField("vtime", Long.toString(vtime));

        return solrDoc;
    } //- buildSolrInputDocument

    /**
     * Joins cluster ids into a single space-separated string.
     * @param ids cluster ids, possibly empty
     * @return the ids separated by single spaces; empty string for no ids
     */
    private static String joinClusterIds(List<Long> ids) {
        StringBuilder joined = new StringBuilder();
        for (Iterator<Long> it = ids.iterator(); it.hasNext();) {
            joined.append(it.next());
            if (it.hasNext()) {
                joined.append(' ');
            }
        }
        return joined.toString();
    } //- joinClusterIds

    /**
     * Convenience overload for cluster records: converts the ThinDoc into a
     * Document via DomainTransformer and delegates to the Document-based
     * buildSolrInputDocument method.
     * @param thinDoc cluster record to index
     * @param cites ids of the clusters cited by this record
     * @param citedby ids of the clusters citing this record
     * @return the populated Solr input document
     * @throws IOException
     */
    private SolrInputDocument buildSolrInputDocument(ThinDoc thinDoc, List<Long> cites, List<Long> citedby)
            throws IOException {
        return buildSolrInputDocument(DomainTransformer.toDocument(thinDoc), cites, citedby);
    } //- buildSolrInputDocument

    /**
     * Builds a list of author normalizations to create more flexible
     * author search.
     * <br><br>
     * Each name is stripped of everything except letters and spaces and
     * split into tokens.  For a name "A B C" the following variants are
     * produced (duplicates are collapsed):
     * <ul>
     * <li>the cleaned full name ("A B C")</li>
     * <li>with more than two tokens: all but the last token reduced to
     * initials, and only the middle tokens reduced to initials</li>
     * <li>with more than one token: first and last token, and first initial
     * plus last token</li>
     * </ul>
     * Names that contain no letters contribute nothing.
     * @param names author names; elements must not be null
     * @return the distinct normalizations, in no particular order
     */
    private static List<String> buildAuthorNorms(List<String> names) {
        HashSet<String> norms = new HashSet<String>();
        for (String name : names) {
            // Only letters and spaces survive, so trimming and splitting on
            // runs of spaces yields exactly the word tokens.
            String cleaned = name.replaceAll("[^\\p{L} ]", "").trim();
            if (cleaned.isEmpty()) {
                // Nothing but digits/punctuation: do not emit an empty norm.
                continue;
            }

            String[] tokens = cleaned.split(" +");
            int last = tokens.length - 1;

            norms.add(joinStringArray(tokens));

            if (tokens.length > 2) {
                String[] allInitials = tokens.clone();
                String[] middleInitials = tokens.clone();
                for (int i = 0; i < last; i++) {
                    String initial = firstLetter(tokens[i]);
                    allInitials[i] = initial;
                    if (i > 0) {
                        middleInitials[i] = initial;
                    }
                }

                norms.add(joinStringArray(allInitials));
                norms.add(joinStringArray(middleInitials));
            }

            if (tokens.length > 1) {
                norms.add(joinStringArray(new String[] { tokens[0], tokens[last] }));
                norms.add(joinStringArray(new String[] { firstLetter(tokens[0]), tokens[last] }));
            }
        }

        return new ArrayList<String>(norms);
    } //- buildAuthorNorms

    /**
     * Returns the first letter of a non-empty token as a string.  Works on
     * code points so that a letter outside the Basic Multilingual Plane is
     * not cut in half.
     */
    private static String firstLetter(String token) {
        return token.substring(0, token.offsetByCodePoints(0, 1));
    } //- firstLetter

    /**
     * Concatenates the given strings, separated by single spaces.
     * @param strings pieces to join, possibly empty
     * @return the joined string; empty for an empty array
     */
    private static String joinStringArray(String[] strings) {
        StringBuilder joined = new StringBuilder();
        String separator = "";
        for (String piece : strings) {
            joined.append(separator).append(piece);
            // Every piece after the first is preceded by a space.
            separator = " ";
        }

        return joined.toString();
    } //- joinStringArray

    /**
     * Fetches the full text of a document through RepositoryUtilities.
     * <br><br>
     * The text is first requested with the last argument set to true; if the
     * repository reports the document as unavailable, a second request is
     * made with it set to false.  If that fails as well, an empty string is
     * returned so that indexing of the remaining fields can continue.
     * @param doc document whose text is wanted
     * @return the document text, an empty string if the text is unavailable,
     * or null if the document has no DOI
     * @throws IOException
     */
    private String getText(Document doc) throws IOException {
        String doi = doc.getDatum(Document.DOI_KEY);
        if (doi == null) {
            return null;
        }

        try {
            // NOTE(review): the boolean presumably selects which variant of the
            // text file is read; confirm against RepositoryUtilities.
            return RepositoryUtilities.getDocumentText(repositoryService, doi, true);
        } catch (DocumentUnavailableException firstAttempt) {
            try {
                return RepositoryUtilities.getDocumentText(repositoryService, doi, false);
            } catch (DocumentUnavailableException ignored) {
                // Deliberately best-effort: a document without retrievable text
                // is still indexed with its metadata.
                return "";
            }
        }
    } //- getText

    /**
     * Removes from the index every cluster id the DAO reports for deletion
     * at the given time, commits, and then clears those deletion records.
     * @param currentTime time passed to the DAO when reading and when
     * clearing deletion records
     * @throws IOException
     * @throws SolrServerException
     */
    private void processDeletions(Date currentTime) throws IOException, SolrServerException {
        Iterator<Long> pending = citedao.getDeletions(currentTime).iterator();
        while (pending.hasNext()) {
            solrServer.deleteById(pending.next().toString());
        }

        // Commit before clearing the records so the DAO is only updated after
        // the deletes were accepted by Solr.
        solrServer.commit();
        citedao.removeDeletions(currentTime);
    } //- processDeletions

    /*
    public static void main(String[] args) throws Exception {
        
    DataSource dataSource = DBCPFactory.createDataSource("citeseerx");
    CSXDAO csxdao = new CSXDAO();
    csxdao.setDataSource(dataSource);
        
    DataSource cgDataSource = DBCPFactory.createDataSource("citegraph");
    CiteClusterDAO citedao = new CiteClusterDAOImpl();
    citedao.setDataSource(cgDataSource);
        
    DataSource cmDataSource = DBCPFactory.createDataSource("citemaster");
    CiteClusterDAO citemaster = new CiteClusterDAOImpl();
    citemaster.setDataSource(cmDataSource);
        
    IndexUpdateManager manager = new IndexUpdateManager();
    manager.setSolrURL("http://130.203.133.38:8983/solr/update");
    manager.setCSXDAO(csxdao);
    manager.setCiteClusterDAO(citemaster);
    manager.setCiteMaster(citemaster);
    manager.indexAll();
    }
    */
} //- class IndexUpdateManager