org.dbpedia.spotlight.lucene.index.external.utils.SSEIndexUtils.java Source code

Java tutorial

Introduction

Here is the source code for org.dbpedia.spotlight.lucene.index.external.utils.SSEIndexUtils.java

Source

/**
 * SSE - A Knowledge Discovery Application
 *
 * Copyright (C) 2014 Federico Cairo, Giuseppe Futia, Federico Benedetto
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

package org.dbpedia.spotlight.lucene.index.external.utils;

import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import java.io.*;
import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Properties;
import java.util.regex.Pattern;

/**
 * Static helpers that pre-process the data files named in a properties file under the
 * {@code org.dbpedia.spotlight.data.*} keys.
 *
 * <p>Two transformations are offered:
 * <ul>
 *   <li>{@link #correctWikipediaDump(Properties)} rewrites selected {@code <ns>} lines of a
 *       bzip2-compressed dump while copying it to an uncompressed file;</li>
 *   <li>{@link #decodeURI(Properties, String, String)} percent-decodes a text file line by line.</li>
 * </ul>
 *
 * <p>Both transformations are best-effort: a failure is logged through {@link #LOG} and the
 * method returns normally instead of propagating the exception.
 */
public class SSEIndexUtils {

    final static Log LOG = LogFactory.getLog(SSEIndexUtils.class);

    /** Prefix shared by every configuration key read by this class. */
    private static final String DATA_KEY_PREFIX = "org.dbpedia.spotlight.data.";

    /** A dump line containing any of these namespace tags is rewritten by the dump correction. */
    private static final String[] NAMESPACE_TAGS_TO_CORRECT = {
        "<ns>118</ns>", "<ns>446</ns>", "<ns>447</ns>",
        "<ns>710</ns>", "<ns>711</ns>",
        "<ns>828</ns>", "<ns>829</ns>"
    };

    /** Numeric token (digits, optional slashes/dots, optional digits) replaced by "0" on corrected lines. */
    private static final Pattern NUMERIC_TOKEN = Pattern.compile("[0-9]+/*\\.*[0-9]*");

    /** A '%' that does not start a valid two-hex-digit escape sequence. */
    private static final Pattern UNESCAPED_PERCENT = Pattern.compile("%(?![0-9a-fA-F]{2})");

    /** Number of dump rows between two progress log messages. */
    private static final int PROGRESS_INTERVAL = 10000000;

    /**
     * Loads the properties file given as first argument and decodes the three URI files it
     * references (concept URIs, transitive-closure redirects, surface forms).
     *
     * <p>{@link #correctWikipediaDump(Properties)} is not invoked from here; call it separately
     * when the dump needs to be corrected.
     *
     * @param args {@code args[0]} is the path of the properties file
     * @throws IOException if the properties file cannot be read
     * @throws IllegalArgumentException if no properties file path is supplied
     */
    public static void main(String[] args) throws IOException {
        if (args == null || args.length < 1) {
            throw new IllegalArgumentException("Usage: SSEIndexUtils <path-to-properties-file>");
        }

        Properties config = new Properties();
        // Close the configuration stream as soon as the properties are loaded.
        try (InputStream in = new FileInputStream(new File(args[0]))) {
            config.load(in);
        }

        decodeURI(config, "conceptURIsToDecode", "conceptURIs");
        decodeURI(config, "redirectsTCToDecode", "redirectsTC");
        decodeURI(config, "surfaceFormsToDecode", "surfaceForms");
    }

    /**
     * Copies the bzip2-compressed dump at {@code org.dbpedia.spotlight.data.originalWikipediaDump}
     * to the plain-text file at {@code org.dbpedia.spotlight.data.wikipediaDump}.
     *
     * <p>Lines containing one of the {@code <ns>} tags in {@link #NAMESPACE_TAGS_TO_CORRECT} have
     * every numeric token replaced by {@code 0} and every {@code '-'} removed; all other lines are
     * copied unchanged. Input and output are both handled as UTF-8.
     *
     * <p>Any failure is logged and swallowed; the output file may then be incomplete.
     *
     * @param config properties holding the two paths; a missing key is treated as an empty path
     */
    public static void correctWikipediaDump(Properties config) {

        LOG.info("Start to correct the Wikipedia dump...");

        String originalDumpPath = config.getProperty(DATA_KEY_PREFIX + "originalWikipediaDump", "").trim();
        String dumpPath = config.getProperty(DATA_KEY_PREFIX + "wikipediaDump", "").trim();

        // try-with-resources closes both streams even when a read or write fails mid-way.
        try (BufferedReader reader = getBufferedReaderForBZ2File(originalDumpPath);
             BufferedWriter writer = Files.newBufferedWriter(Paths.get(dumpPath), StandardCharsets.UTF_8)) {

            long rows = 0; // long: a full dump can exceed the int range
            String line;
            while ((line = reader.readLine()) != null) {
                if (needsCorrection(line)) {
                    LOG.info("Correct row " + rows + ":" + line);
                    line = NUMERIC_TOKEN.matcher(line).replaceAll("0");
                    line = line.replace("-", "");
                }
                writer.write(line);
                writer.newLine();

                rows += 1;
                if (rows % PROGRESS_INTERVAL == 0) {
                    LOG.info("Processed " + rows + " rows");
                }
            }
        } catch (Exception e) {
            // Best-effort by design: report through the logger and return normally.
            LOG.error("Failed to correct the Wikipedia dump '" + originalDumpPath
                    + "' into '" + dumpPath + "'", e);
            return;
        }

        // Only reached once the writer has been flushed and closed successfully.
        LOG.info("Done");
    }

    /**
     * Percent-decodes a UTF-8 text file line by line into another UTF-8 text file.
     *
     * <p>Before decoding, each {@code '%'} that is not followed by two hex digits is escaped as
     * {@code %25} and each {@code '+'} as {@code %2B}, so that {@link URLDecoder} neither rejects
     * the stray percent sign nor turns the plus sign into a space.
     *
     * <p>Any I/O or decoding failure is logged and swallowed; the output file may then be
     * incomplete.
     *
     * @param config   properties holding the file paths; a missing key is treated as an empty path
     * @param toDecode key suffix (after {@code org.dbpedia.spotlight.data.}) of the input file
     * @param decoded  key suffix (after {@code org.dbpedia.spotlight.data.}) of the output file
     */
    public static void decodeURI(Properties config, String toDecode, String decoded) {

        LOG.info("Start to decode URIs...");

        final Path src = Paths.get(config.getProperty(DATA_KEY_PREFIX + toDecode, "").trim());
        final Path dst = Paths.get(config.getProperty(DATA_KEY_PREFIX + decoded, "").trim());

        // try-with-resources closes both files even when decoding fails mid-way.
        try (BufferedReader reader = Files.newBufferedReader(src, StandardCharsets.UTF_8);
             BufferedWriter writer = Files.newBufferedWriter(dst, StandardCharsets.UTF_8)) {

            String line;
            while ((line = reader.readLine()) != null) {
                writer.write(decodeLine(line));
                writer.newLine();
            }
        } catch (Exception e) {
            // Best-effort by design: report through the logger and return normally.
            LOG.error("Failed to decode URIs from '" + src + "' to '" + dst + "'", e);
            return;
        }

        // Only reached once the writer has been flushed and closed successfully.
        LOG.info("Done");
    }

    /**
     * Opens a bzip2-compressed file for line-oriented reading as UTF-8 text.
     *
     * @param fileIn path of the {@code .bz2} file
     * @return a buffered reader over the decompressed content; the caller must close it
     * @throws IOException if the file cannot be opened or is not valid bzip2 data
     * @throws CompressorException declared for backward compatibility with existing callers
     */
    public static BufferedReader getBufferedReaderForBZ2File(String fileIn)
            throws IOException, CompressorException {
        FileInputStream fin = new FileInputStream(fileIn);
        try {
            // Second argument 'true': keep decompressing across concatenated bzip2 streams
            // instead of stopping after the first one.
            BZip2CompressorInputStream input = new BZip2CompressorInputStream(fin, true);
            return new BufferedReader(new InputStreamReader(input, StandardCharsets.UTF_8));
        } catch (IOException | RuntimeException e) {
            // The decompressor could not be created, so nobody else will close the file handle.
            try {
                fin.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    /** Tells whether a dump line carries one of the namespace tags that must be rewritten. */
    private static boolean needsCorrection(String line) {
        for (String tag : NAMESPACE_TAGS_TO_CORRECT) {
            if (line.contains(tag)) {
                return true;
            }
        }
        return false;
    }

    /** Escapes stray '%' and every '+' in the line, then percent-decodes it as UTF-8. */
    private static String decodeLine(String line) throws UnsupportedEncodingException {
        String escaped = UNESCAPED_PERCENT.matcher(line).replaceAll("%25");
        escaped = escaped.replace("+", "%2B");
        return URLDecoder.decode(escaped, "UTF-8");
    }
}