org.tallison.wikilinks.ExternalLinkExtractor.java Source code

Java tutorial

Introduction

Here is the source code for org.tallison.wikilinks.ExternalLinkExtractor.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.tallison.wikilinks;

import org.apache.commons.lang.StringUtils;
import org.tallison.URLUtil;

import java.io.*;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

/**
 * Class to extract and filter links from
 * wikipedia external link dumps.
 *
 * <p>The gzipped input is scanned byte by byte. Everything up to the first
 * {@code VALUES} keyword is skipped; after that every parenthesised,
 * comma-separated record is parsed (the layout looks like the
 * {@code INSERT ... VALUES (...),(...)} form of a SQL dump -- confirm against
 * the actual dump). Only the cell at zero-based column index 3 is kept: it is
 * normalised, passed through {@link URLUtil#clean}, and written to the output
 * file as {@code host<TAB>url}, one line per non-blank URL.
 */
public class ExternalLinkExtractor {
    private static final int BACKSLASH = '\\';
    private static final int COMMA = ',';
    private static final int OPEN_PAREN = '(';
    private static final int CLOSE_PAREN = ')';
    private static final int SQUOTE = '\'';
    /** Value returned by {@link InputStream#read()} at end of stream. */
    private static final int EOF = -1;

    /** Zero-based index of the only record column that is captured and written. */
    private static final int URL_COLUMN = 3;
    /** Buffer size, in bytes, handed to the {@link GZIPInputStream}. */
    private static final int GZIP_BUFFER_SIZE = 10000000;
    /** A progress count is printed to stdout every this many records. */
    private static final int PROGRESS_INTERVAL = 1000;
    /** Keyword that marks the start of the record list in the dump. */
    private static final String VALUES_KEYWORD = "VALUES";
    /** Compiled once; applied to every captured cell. */
    private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");

    URLUtil urlUtil = new URLUtil();

    /**
     * Command-line entry point.
     *
     * @param args {@code args[0]} is the gzipped links dump to read,
     *             {@code args[1]} is the output file to write
     * @throws IllegalArgumentException if fewer than two arguments are given
     * @throws Exception if reading the dump or writing the output fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            throw new IllegalArgumentException(
                    "Usage: ExternalLinkExtractor <gzipped-links-dump> <output-table>");
        }
        ExternalLinkExtractor ex = new ExternalLinkExtractor();
        Path gzLinksFile = Paths.get(args[0]);
        Path table = Paths.get(args[1]);
        ex.execute(gzLinksFile, table);
    }

    /**
     * Reads the whole dump and writes one {@code host<TAB>url} line per usable link.
     *
     * @param gzLinksFile gzipped dump to read
     * @param table       output file, written as UTF-8
     * @throws IOException on any read/write failure, if the {@code VALUES}
     *                     keyword is never found, or if the input ends in the
     *                     middle of a record
     */
    private void execute(Path gzLinksFile, Path table) throws IOException {
        int recordCount = 0;
        // The raw file stream is its own resource so that it is still closed
        // when the GZIPInputStream constructor rejects the input.
        try (BufferedWriter writer = Files.newBufferedWriter(table, StandardCharsets.UTF_8);
             InputStream raw = Files.newInputStream(gzLinksFile);
             InputStream is = new BufferedInputStream(
                     new GZIPInputStream(raw, GZIP_BUFFER_SIZE))) {
            readToVALUES(is);
            int chr;
            while ((chr = is.read()) != EOF) {
                // Bytes between records (commas, statement text, newlines) are skipped.
                if (chr == OPEN_PAREN) {
                    readRecord(is, writer);
                    recordCount++;
                    if (recordCount % PROGRESS_INTERVAL == 0) {
                        System.out.println(recordCount);
                    }
                }
            }
        }
    }

    /**
     * Consumes the stream up to and including the first occurrence of the
     * ASCII keyword {@code VALUES}.
     *
     * @param is stream to advance
     * @throws IOException if the stream ends before the keyword is found
     */
    private void readToVALUES(InputStream is) throws IOException {
        int matched = 0;
        while (matched < VALUES_KEYWORD.length()) {
            int c = is.read();
            // Checked on every byte: once the stream is exhausted read() keeps
            // returning -1, so skipping this check would spin forever.
            if (c == EOF) {
                throw new IOException("EOF reached before finding Values");
            }
            if (c == VALUES_KEYWORD.charAt(matched)) {
                matched++;
            } else if (c == VALUES_KEYWORD.charAt(0)) {
                // All letters of the keyword are distinct, so the only way a
                // failed match can overlap a new one is by starting at this byte.
                matched = 1;
            } else {
                matched = 0;
            }
        }
    }

    /**
     * Parses one record, starting just after its opening parenthesis and
     * ending at its closing parenthesis, and hands the captured column to
     * {@link #handleCell}.
     *
     * @param is     stream positioned immediately after {@code '('}
     * @param writer destination for the extracted link
     * @throws IOException if the stream ends inside the record or writing fails
     */
    private void readRecord(InputStream is, BufferedWriter writer) throws IOException {
        int colNumber = 0;
        ByteArrayOutputStream cell = new ByteArrayOutputStream();
        while (true) {
            int chr = is.read();
            switch (chr) {
            case EOF:
                throw new IOException("EOF reading record");
            case SQUOTE:
                // Quoted cell: commas and parentheses inside it are data, not delimiters.
                readToSquote(colNumber, is, cell);
                break;
            case CLOSE_PAREN:
                handleCell(colNumber, cell, writer);
                return;
            case COMMA:
                handleCell(colNumber, cell, writer);
                cell.reset();
                colNumber++;
                break;
            default:
                // Unquoted bytes are only kept for the column of interest.
                if (colNumber == URL_COLUMN) {
                    cell.write(chr);
                }
                break;
            }
        }
    }

    /**
     * Writes the captured cell as {@code host<TAB>url} if it belongs to the
     * URL column and is non-blank after cleaning; otherwise does nothing.
     *
     * @param colNumber zero-based index of the column that just ended
     * @param bos       raw bytes captured for that column (decoded as UTF-8)
     * @param writer    destination for the output line
     * @throws IOException if writing fails
     */
    private void handleCell(int colNumber, ByteArrayOutputStream bos, BufferedWriter writer)
            throws IOException {
        if (colNumber != URL_COLUMN) {
            return;
        }

        String urlString = new String(bos.toByteArray(), StandardCharsets.UTF_8);
        // Protocol-relative links get an explicit scheme.
        if (urlString.startsWith("//")) {
            urlString = "http:" + urlString;
        }
        // Get rid of new lines and tabs so each link stays on one output line.
        urlString = WHITESPACE_RUN.matcher(urlString).replaceAll(" ");
        urlString = urlUtil.clean(urlString);
        if (StringUtils.isBlank(urlString)) {
            return;
        }
        writer.write(extractHost(urlString) + "\t" + urlString);
        writer.newLine();
    }

    /**
     * Returns the host component of {@code urlString}, or the empty string
     * when the URL cannot be parsed or has no host. Without the null check a
     * host-less URI would be written out as the literal text "null".
     */
    private static String extractHost(String urlString) {
        try {
            String host = new URI(urlString).getHost();
            return (host == null) ? "" : host;
        } catch (URISyntaxException ignored) {
            // Deliberately best-effort: an unparseable URL is still written,
            // just with an empty host column.
            return "";
        }
    }

    /**
     * Consumes a single-quoted cell up to and including its closing quote.
     * A backslash escapes the following byte, which is copied as-is (so an
     * escaped quote does not end the cell). Bytes are only stored when the
     * cell belongs to the URL column.
     *
     * @param colNumber zero-based index of the column being read
     * @param is        stream positioned immediately after the opening quote
     * @param bos       buffer receiving the cell content
     * @throws IOException if the stream ends before the closing quote
     */
    private void readToSquote(int colNumber, InputStream is, OutputStream bos) throws IOException {
        while (true) {
            int chr = is.read();
            switch (chr) {
            case EOF:
                throw new IOException("HIT EOF before end of squoted cell");
            case BACKSLASH:
                int nxt = is.read();
                if (nxt == EOF) {
                    throw new IOException("EOF in squoted cell after backslash");
                } else if (colNumber == URL_COLUMN) {
                    bos.write(nxt);
                }
                break;
            case SQUOTE:
                return;
            default:
                if (colNumber == URL_COLUMN) {
                    bos.write(chr);
                }
                break;
            }
        }
    }

}