de.tudarmstadt.ukp.dkpro.core.io.jwpl.WikipediaRevisionPairReader.java Source code

Java tutorial

Introduction

Here is the source code for de.tudarmstadt.ukp.dkpro.core.io.jwpl.WikipediaRevisionPairReader.java

Source

/*******************************************************************************
 * Copyright 2010
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package de.tudarmstadt.ukp.dkpro.core.io.jwpl;

import java.io.IOException;
import java.sql.Timestamp;

import org.apache.commons.lang.StringEscapeUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Level;

import de.tudarmstadt.ukp.dkpro.core.io.jwpl.util.WikiUtils;
import de.tudarmstadt.ukp.wikipedia.api.exception.WikiApiException;
import de.tudarmstadt.ukp.wikipedia.parser.ParsedPage;
import de.tudarmstadt.ukp.wikipedia.revisionmachine.api.Revision;

/**
 * Reads pairs of adjacent revisions of all articles.
 * <p>
 * For every call to {@link #getNext(JCas)} two views are created in the CAS, named
 * {@link #REVISION_1} and {@link #REVISION_2}. {@link #REVISION_2} holds the revision belonging
 * to the timestamp that was just read from the timestamp iterator, {@link #REVISION_1} holds the
 * revision belonging to the timestamp read by the previous call. If no previous timestamp is
 * remembered (first call, or the iterator was exhausted by the previous call), an empty
 * placeholder revision with revision id 0 is used for {@link #REVISION_1}.
 *
 * @author zesch
 *
 */
@TypeCapability(outputs = { "de.tudarmstadt.ukp.dkpro.core.io.jwpl.type.DBConfig",
        "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData" })
public class WikipediaRevisionPairReader extends WikipediaRevisionReaderBase {

    /** Name of the CAS view that receives the earlier revision of a pair. */
    public static final String REVISION_1 = "Revision1";
    /** Name of the CAS view that receives the later revision of a pair. */
    public static final String REVISION_2 = "Revision2";

    /**
     * Lower bound (inclusive) for the absolute difference of the text lengths of the two
     * revisions, counted in characters. Pairs whose difference is smaller than this value are
     * still emitted, but with empty document text in both views.
     * */
    public static final String PARAM_MIN_CHANGE = "MinChange";
    @ConfigurationParameter(name = PARAM_MIN_CHANGE, mandatory = true, defaultValue = "0")
    private int minChange;

    /**
     * Upper bound (inclusive) for the absolute difference of the text lengths of the two
     * revisions, counted in characters. Pairs whose difference is larger than this value are
     * still emitted, but with empty document text in both views.
     * */
    public static final String PARAM_MAX_CHANGE = "MaxChange";
    @ConfigurationParameter(name = PARAM_MAX_CHANGE, mandatory = true, defaultValue = "10000")
    private int maxChange;

    /**
     * The number of revision pairs that should be skipped in the beginning. Skipped pairs are
     * emitted as two empty placeholder revisions; no revision is fetched for them.
     */
    public static final String PARAM_SKIP_FIRST_N_PAIRS = "SkipFirstNPairs";
    @ConfigurationParameter(name = PARAM_SKIP_FIRST_N_PAIRS, mandatory = false)
    protected int skipFirstNPairs;

    /**
     * Timestamp handled by the previous call to {@link #getNext(JCas)}, or {@code null} if the
     * next pair has no predecessor.
     */
    private Timestamp savedTimestamp;

    /** Number of completed calls to {@link #getNext(JCas)}, including skipped pairs. */
    private int nrOfRevisionsProcessed;

    /**
     * Initializes the reader. A configured list of revision ids is not supported by this reader:
     * if one is set, a warning is logged and the setting is cleared before the base class is
     * initialized.
     *
     * @param context
     *            the UIMA context handed on to the base class
     * @throws ResourceInitializationException
     *             if the base class fails to initialize
     */
    @Override
    public void initialize(UimaContext context) throws ResourceInitializationException {
        // NOTE(review): the fields are cleared before super.initialize(), presumably so that the
        // base class never acts on the predefined revision list - confirm against the base class.
        if (revisionIdFile != null || revisionIdParamArray != null) {
            this.getLogger().log(Level.WARNING,
                    "Reading a predefined list of revisions is currently not supported by the WikipediaRevisionPairReader. Falling back to reading ALL revisions.");
            revisionIdFile = null;
            revisionIdParamArray = null;
            // TODO add support for reading a defined set of revisions (like the
            // WikipediaRevisionReader)
        }
        super.initialize(context);
        savedTimestamp = null;
        nrOfRevisionsProcessed = 0;
    }

    /**
     * Fills the CAS with the next pair of revisions.
     * <p>
     * Creates the views {@link #REVISION_1} and {@link #REVISION_2}, sets their document text,
     * and adds document meta data and a revision annotation to each of them. The initial view
     * receives the meta data of the first revision of the pair.
     *
     * @param jcas
     *            the CAS to fill
     * @throws IOException
     *             declared by the overridden method
     * @throws CollectionException
     *             if the timestamp iterator yields {@code null}, if a revision cannot be
     *             retrieved, or if a view cannot be created
     */
    @Override
    public void getNext(JCas jcas) throws IOException, CollectionException {
        super.getNext(jcas);

        Timestamp currentTimestamp = timestampIter.next();

        if (currentTimestamp == null) {
            // Use a specific unchecked exception as cause instead of a raw Throwable.
            throw new CollectionException(
                    new IllegalStateException("Current timestamp is null. Upps ... should not happen."));
        }

        this.getLogger().log(Level.FINE, currentArticle.getPageId() + "-" + currentTimestamp);

        try {

            JCas revView1 = jcas.createView(REVISION_1);
            JCas revView2 = jcas.createView(REVISION_2);

            Revision revision1;
            Revision revision2;
            String text1 = "";
            String text2 = "";

            if (nrOfRevisionsProcessed < skipFirstNPairs) {
                // Still inside the skip window: report progress every 1000 pairs.
                if (nrOfRevisionsProcessed % 1000 == 0) {
                    this.getLogger().log(Level.INFO, "Skipping " + nrOfRevisionsProcessed + "th revision.");
                }
                // create fake revisions
                revision1 = getRevision(null);
                revision2 = getRevision(null);
            } else {
                // savedTimestamp may be null here, in which case revision1 is an empty
                // placeholder and the difference equals the length of text2.
                revision1 = getRevision(savedTimestamp);
                revision2 = getRevision(currentTimestamp);

                text1 = getText(revision1);
                text2 = getText(revision2);

                // Pairs outside [minChange, maxChange] keep their meta data but lose their text.
                int difference = Math.abs(text1.length() - text2.length());
                if (difference < minChange || difference > maxChange) {
                    text1 = "";
                    text2 = "";
                }
            }

            revView1.setDocumentText(text1);
            revView2.setDocumentText(text2);

            addDocumentMetaData(jcas, currentArticle.getPageId(), revision1.getRevisionID());
            addDocumentMetaData(revView1, currentArticle.getPageId(), revision1.getRevisionID());
            addDocumentMetaData(revView2, currentArticle.getPageId(), revision2.getRevisionID());

            addRevisionAnnotation(revView1, revision1);
            addRevisionAnnotation(revView2, revision2);

            savedTimestamp = currentTimestamp;

            // Once the iterator is exhausted the next pair must not be compared against this
            // timestamp, so forget it.
            if (!timestampIter.hasNext()) {
                savedTimestamp = null;
            }

            nrOfRevisionsProcessed++;
        } catch (WikiApiException e) {
            throw new CollectionException(e);
        } catch (CASException e) {
            throw new CollectionException(e);
        }
    }

    /**
     * Returns the text of a revision. If {@code outputPlainText} is set, HTML entities are
     * unescaped, the text is run through the parser and the parsed text is cleaned with
     * {@link WikiUtils#cleanText(String)}; otherwise the revision text is returned unchanged.
     *
     * @param rev
     *            the revision to read the text from
     * @return the (possibly converted) revision text, or the empty string if the parser returns
     *         {@code null}
     */
    // TODO Use SWEBLE
    private String getText(Revision rev) {
        String text = rev.getRevisionText();

        if (outputPlainText) {
            text = StringEscapeUtils.unescapeHtml(text);

            ParsedPage pp = parser.parse(text);

            if (pp == null) {
                return "";
            }

            text = pp.getText();

            // text = WikiUtils.mediaWikiMarkup2PlainText(text);

            // replace multiple white space with single white space
            text = WikiUtils.cleanText(text);
        }

        return text;

    }

    /**
     * Returns the revision of the current article for the given timestamp.
     *
     * @param timestamp
     *            the timestamp of the wanted revision, or {@code null} to obtain an empty
     *            placeholder revision (revision id 0, empty text, empty comment and contributor
     *            name) for the current article
     * @return the revision fetched via the revision API, or the placeholder revision
     * @throws CollectionException
     *             if the revision API fails to deliver the revision
     */
    private Revision getRevision(Timestamp timestamp) throws CollectionException {
        Revision revision;

        if (timestamp != null) {
            try {
                revision = this.revisionApi.getRevision(currentArticle.getPageId(), timestamp);
            } catch (WikiApiException e) {
                throw new CollectionException(e);
            }
        } else {
            revision = new Revision(0);
            revision.setArticleID(currentArticle.getPageId());
            revision.setComment("");
            revision.setContributorName("");
            revision.setContributorId(null);
            revision.setRevisionID(0);
            revision.setRevisionText("");
            // timestamp is always null in this branch, so the placeholder has no timestamp
            revision.setTimeStamp(timestamp);
            revision.setMinor(false);
        }

        return revision;
    }
}