dkpro.similarity.algorithms.wikipedia.measures.WikiLinkCache.java Source code

Java tutorial

Introduction

Here is the source code for dkpro.similarity.algorithms.wikipedia.measures.WikiLinkCache.java

Source

/*******************************************************************************
 * Copyright 2012
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universitt Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package dkpro.similarity.algorithms.wikipedia.measures;

import gnu.trove.TIntObjectHashMap;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import de.tudarmstadt.ukp.wikipedia.api.MetaData;
import de.tudarmstadt.ukp.wikipedia.api.Page;
import de.tudarmstadt.ukp.wikipedia.api.Wikipedia;
import de.tudarmstadt.ukp.wikipedia.api.exception.WikiApiException;

public class WikiLinkCache {

    private final Log logger = LogFactory.getLog(getClass());

    // maps pageIds to a list of inlinks represented by pageIds
    protected TIntObjectHashMap<int[]> cachedInLinks;

    private Wikipedia wiki;

    public WikiLinkCache(Wikipedia wiki) {
        this.wiki = wiki;
        this.cachedInLinks = new TIntObjectHashMap<int[]>();
    }

    public boolean isCacheEmpty() {
        if (cachedInLinks.size() == 0) {
            return true;
        } else {
            return false;
        }
    }

    public void fillInLinkCache()
            throws WikiApiException, FileNotFoundException, IOException, ClassNotFoundException {

        logger.info("Filling InlinkCache ...");

        if (!isCacheEmpty()) {
            return;
        }

        File serializedCacheFile = getSerializedCacheFile(wiki);
        if (serializedCacheFile.exists()) {
            cachedInLinks = (TIntObjectHashMap<int[]>) deserializeObject(serializedCacheFile);
            return;
        }

        int i = 0;
        for (Page article : wiki.getArticles()) {

            int[] inlinkIdIntArray = WikiLinkComparator.getInlinkIds(article);

            int pageId = article.getPageId();
            cachedInLinks.put(pageId, inlinkIdIntArray);

            if (i % 10000 == 0) {
                System.out.print(".");
            }
            i++;
        }
        System.out.println();

        serializeObject(cachedInLinks, serializedCacheFile);
    }

    public File getSerializedCacheFile(Wikipedia wiki) throws WikiApiException {
        MetaData metaData = wiki.getMetaData();
        StringBuilder sb = new StringBuilder();
        sb.append("WikipediaInlinkCache_");
        sb.append(metaData.getLanguage());
        sb.append("_");
        sb.append(metaData.getVersion());
        return new File(sb.toString());
    }

    /**
     * Serializes the cache and saves it to the given file.
     *
     * @param file the file to save the cache to
     * @throws IOException
     * @throws FileNotFoundException
     * @throws IOException if the file cannot be read
     */
    public void serializeObject(Object o, File file) throws FileNotFoundException, IOException {
        logger.info("Writing cache to file: " + file.getAbsolutePath());
        ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(file));
        oos.writeObject(o);
        oos.flush();
        oos.close();
    }

    /**
     * Loads the cache from file
     *
     * @param file
     * @throws IOException
     * @throws FileNotFoundException
     * @throws ClassNotFoundException
     * @throws Exception
     */
    public Object deserializeObject(File file) throws FileNotFoundException, IOException, ClassNotFoundException {
        logger.info("Loading cache from file: " + file.getAbsolutePath());

        ObjectInputStream ois = new ObjectInputStream(new FileInputStream(file));
        Object o = ois.readObject();
        ois.close();

        logger.info("Done.");

        return o;
    }
}