edu.psu.citeseerx.corrections.CorrectAuthors.java Source code


Introduction

Here is the source code for edu.psu.citeseerx.corrections.CorrectAuthors.java, a CiteSeerX maintenance job that removes duplicate author records from a document's metadata.
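
Before the listing, a minimal sketch of how the class might be wired and run. The setter and method names come from the source below; the helper method and its parameters are placeholders for illustration, since in CiteSeerX these dependencies are normally injected from the Spring application context rather than built by hand.

// Hypothetical wiring (placeholder names); assumes the CiteSeerX classes are on the classpath.
public static int runAuthorDedup(CSXDAO csxdao, CiteClusterDAO citedao,
        UpdateManager updateManager, String modelFile) {
    CorrectAuthors fixer = new CorrectAuthors();
    fixer.setCSXDAO(csxdao);               // document metadata access
    fixer.setCiteClusterDAO(citedao);      // stores the last dedup timestamp
    fixer.setUpdateManager(updateManager); // applies corrections to the repository
    fixer.setModelFile(modelFile);         // model file consumed by SeerSoftTFIDF
    fixer.setSimilarityFactor(0.8);        // optional; 0.8 is the default in the source
    return fixer.process();                // returns the number of corrected documents
}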

Source

/*
 * Copyright 2007 Penn State University
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *     http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.psu.citeseerx.corrections;

import java.util.Date;
import java.util.List;

import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.json.JSONException;

import edu.psu.citeseerx.dao2.logic.CSXDAO;
import edu.psu.citeseerx.dao2.logic.CiteClusterDAO;
import edu.psu.citeseerx.domain.Author;
import edu.psu.citeseerx.domain.Document;
import edu.psu.citeseerx.updates.UpdateManager;
import edu.psu.citeseerx.utility.SeerSoftTFIDF;

/**
 * Deletes duplicate authors from the same paper if any are found.
 * This job searches for duplicate authors in a paper and deletes the one
 * with less information.
 * 
 * An author is duplicated if more than one author record shares the same
 * name within the same document (a distance metric is used to determine
 * whether two names are equal).
 * @author Juan Pablo Fernandez Ramirez
 * @version $Rev$ $Date$
 */
public class CorrectAuthors {

    protected final Log logger = LogFactory.getLog(getClass());

    private static final String CORRECTION_SOURCE = "user correction - " + "Author Dedup";
    private static final int MAX_RECORDS = 1000;

    private UpdateManager updateManager;

    public void setUpdateManager(UpdateManager updateManager) {
        this.updateManager = updateManager;
    } //- setUpdateManager

    private CSXDAO csxdao;

    public void setCSXDAO(CSXDAO csxdao) {
        this.csxdao = csxdao;
    } //- setCSXDAO

    private CiteClusterDAO citedao;

    public void setCiteClusterDAO(CiteClusterDAO citedao) {
        this.citedao = citedao;
    } //- setCiteClusterDAO

    private String modelFile;

    public void setModelFile(String modelFile) {
        this.modelFile = modelFile;
    } //- setModelFile

    private double similarityFactor = 0.8;

    public void setSimilarityFactor(double similarityFactor) {
        this.similarityFactor = similarityFactor;
    } //- setSimilarityFactor

    private SeerSoftTFIDF distanceMetric = null;

    /*
     * Returns true if the two authors have the same name.
     */
    private boolean compareAuthors(Author a1, Author a2) {

        boolean result = false;
        double similarity = distanceMetric.score(a1.getDatum(Author.NAME_KEY), a2.getDatum(Author.NAME_KEY));

        if (similarity >= similarityFactor) {
            // They are equal or very similar, so treat them as the same author.
            result = true;
        }

        return result;
    } //- compareAuthors

    private String authorKeys[] = { Author.NAME_KEY, Author.AFFIL_KEY, Author.ADDR_KEY, Author.EMAIL_KEY };

    /*
     * Returns the number of fields with a meaningful value, i.e. a value
     * that is neither null nor an empty string.
     * 
     * Order is not included in the checked fields since it always has a
     * value, and it is not meaningful for the purpose of this fixer.
     */
    private int howManyFilledFields(Author author) {
        int filled = 0;

        String value;
        for (int j = 0; j < authorKeys.length; ++j) {
            value = author.getDatum(authorKeys[j]);
            if (null != value && value.trim().length() > 0) {
                filled++;
            }
        }
        return filled;
    } //- howManyFilledFields

    /*
     * Sets the correction source for the author
     */
    private void setCorrectionSource(Author author) {
        for (int j = 0; j < authorKeys.length; ++j) {
            author.setSource(authorKeys[j], CORRECTION_SOURCE);
        }
    } //- setCorrectionSource

    /*
     * Authors a1 and a2 have the same name but are probably not equal from
     * the document's standpoint (a different order, for instance).
     * 
     * Returns:
     *  -1 if author1 has more data than author2
     *   0 if both authors have the same amount of data
     *   1 if author2 has more data than author1
     */
    private int bestAuthor(Author a1, Author a2) {
        int result = 0;
        int filledFieldsA1 = howManyFilledFields(a1);
        int filledFieldsA2 = howManyFilledFields(a2);
        if (filledFieldsA2 > filledFieldsA1) {
            result = 1;
        } else if (filledFieldsA1 > filledFieldsA2) {
            result = -1;
        }
        return result;

    } //- bestAuthor

    /*
     * Eliminates duplicate authors from this document, if any.
     * The author list is sorted by order of appearance, which means that
     * duplicates could appear anywhere in the list.
     */
     */
    private boolean eliminateDuplicateAuthors(Document doc) {
        List<Author> authors = doc.getAuthors();

        if (null == authors || authors.size() <= 1) {
            // Nothing to do.
            return false;
        }

        boolean corrected = false;
        for (int i = authors.size() - 1; i >= 1; --i) {
            Author lastAuthor = authors.get(i);
            for (int j = i - 1; j >= 0; --j) {
                Author currentAuthor = authors.get(j);
                if (compareAuthors(lastAuthor, currentAuthor)) {
                    logger.info("Duplicate authors have been found: " + lastAuthor.getDatum(Author.NAME_KEY)
                            + " and " + currentAuthor.getDatum(Author.NAME_KEY));

                    // They are the same. Find out which one has more data.
                    if (-1 == bestAuthor(lastAuthor, currentAuthor)) {
                        authors.set(j, lastAuthor);
                    }

                    // No matter what, we always delete the later duplicate.
                    authors.remove(i);
                    corrected = true;
                    break;
                }
            }
        }
        if (corrected) {
            for (int i = 0; i < authors.size(); i++) {
                authors.get(i).setDatum(Author.ORD_KEY, Integer.toString(i + 1));
                authors.get(i).setSource(Author.ORD_KEY, CORRECTION_SOURCE);

                // update source of correction for the rest of the fields
                setCorrectionSource(authors.get(i));
            }

            // Send the correction to the database.
            logger.info("Sending correction of: " + doc.getDatum(Document.DOI_KEY));
            try {
                updateManager.doCorrection(doc, "SystemCorrections");
            } catch (IOException e) {
                logger.error("An error occurred while saving correction for: " + doc.getDatum(Document.DOI_KEY), e);
            } catch (JSONException e) {
                logger.error("An error occurred while saving correction for: " + doc.getDatum(Document.DOI_KEY), e);
            }
        }
        return corrected;
    } //- eliminateDuplicateAuthors

    /**
     * Searches for new documents within the corpus and deletes duplicate
     * authors. If a user correction is found, nothing is done.
     * @return The number of corrected documents
     */
    public int process() {

        int numDocuments = 0;
        int numUpdated = 0;
        String lastDOI = "0.0.0.0.0";
        distanceMetric = new SeerSoftTFIDF(similarityFactor);

        try {
            distanceMetric.loadModel(modelFile);
            Date start = citedao.getLastAuthorDedupTime();
            Date end = new Date(System.currentTimeMillis());

            List<String> dois = csxdao.getCrawledDOIs(start, end, lastDOI, MAX_RECORDS);
            boolean finished = dois.isEmpty();
            boolean corrected;
            while (!finished) {
                for (String doi : dois) {
                    numDocuments++;
                    // get the document.
                    Document doc = csxdao.getDocumentFromDB(doi);
                    logger.info("Processing document: " + doi);
                    // Skip documents that already carry a user correction.
                    if (!"USER".equals(doc.getVersionName())) {
                        corrected = eliminateDuplicateAuthors(doc);
                    } else {
                        corrected = false;
                    }
                    logger.info("Document: " + doi + " has been processed");
                    lastDOI = doi;
                    if (corrected) {
                        numUpdated++;
                    }
                }
                if ((numDocuments % MAX_RECORDS) == 0) {
                    logger.info("Processed: " + numDocuments);
                }
                dois = csxdao.getCrawledDOIs(start, end, lastDOI, MAX_RECORDS);
                finished = dois.isEmpty();

            }
            logger.info("Processed: " + numDocuments);
            citedao.setLastAuthorDedupTime(end);
        } catch (IOException e) {
            logger.error("An error ocurred while loading the model file", e);
        }
        logger.info(numUpdated + " documents were updated");
        return numUpdated;
    } //- process

} //- class CorrectAuthors
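
For readers who only want the shape of the deduplication pass, the following self-contained sketch (not part of CiteSeerX) applies the same scan-from-the-end strategy as eliminateDuplicateAuthors above, with a simple case-insensitive comparison standing in for the SeerSoftTFIDF similarity score; the class and method names are illustrative only.

import java.util.ArrayList;
import java.util.List;

public class DedupSketch {

    // Stand-in for compareAuthors(): two names match when equal ignoring case.
    // CorrectAuthors instead compares a SeerSoftTFIDF score against a threshold.
    private static boolean sameName(String a, String b) {
        return a.equalsIgnoreCase(b);
    }

    // Mirrors the loop structure of eliminateDuplicateAuthors(): walk the list
    // from the end and drop the later entry whenever it matches an earlier one.
    public static List<String> dedup(List<String> names) {
        List<String> result = new ArrayList<>(names);
        for (int i = result.size() - 1; i >= 1; --i) {
            for (int j = i - 1; j >= 0; --j) {
                if (sameName(result.get(i), result.get(j))) {
                    result.remove(i);
                    break;
                }
            }
        }
        return result;
    }

    public static void main(String[] args) {
        // Prints [J. Smith, A. Doe]
        System.out.println(dedup(List.of("J. Smith", "A. Doe", "j. smith")));
    }
}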