de.gesundkrank.wikipedia.hadoop.parser.Parser.java Source code

Java tutorial

Introduction

Here is the source code for de.gesundkrank.wikipedia.hadoop.parser.Parser.java

Source

/*
 * This file is part of wikipedia-hadoop.
 * Copyright (C) 2015 Jan Graegger
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

package de.gesundkrank.wikipedia.hadoop.parser;

import de.gesundkrank.wikipedia.hadoop.WikiPageWritable;
import de.gesundkrank.wikipedia.hadoop.WikiRevisionContributor;
import de.gesundkrank.wikipedia.hadoop.WikiRevisionWritable;
import org.apache.commons.lang3.StringEscapeUtils;

import javax.xml.bind.DatatypeConverter;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Line-based, regular-expression driven reader for the page/revision XML handled by this
 * package. Each call to {@link #readNextRevision(BufferedReader)} consumes lines from the
 * given reader until the end of one revision (or the end of the input) and returns the
 * collected data.
 *
 * <p>The parser keeps the page it is currently inside, the revision being built and a set
 * of "already found" markers in instance fields, so consecutive calls on the same instance
 * continue where the previous call stopped.
 *
 * <p>Title, username and comment values are returned exactly as they appear in the XML
 * (entities are not decoded); only the revision text is passed through
 * {@link StringEscapeUtils#unescapeXml(String)}.
 *
 * @author Jan Graegger<jan.grassegger@uni-weimar.de>
 */
public class Parser {
    public static final String PAGE_START = "<page>";
    private static final char NEWLINE = '\n';

    private static final Pattern TITLE_PATTERN = Pattern.compile(".*<title>(.+)</title>.*");
    private static final Pattern ID_PATTERN = Pattern.compile(".*<id>([0-9]+)</id>.*");
    private static final Pattern REDIRECT_PATTERN = Pattern.compile(".*<redirect.*/>.*");
    private static final Pattern REVISION_PATTERN = Pattern.compile(".*<revision>.*");
    private static final Pattern REVISION_END_PATTERN = Pattern.compile(".*</revision>.*");
    private static final Pattern TIMESTAMP_PATTERN = Pattern.compile(".*<timestamp>(.+)</timestamp>");
    private static final Pattern CONTRIBUTOR_PATTERN = Pattern.compile(".*<contributor>.*");
    private static final Pattern CONTRIBUTOR_END_PATTERN = Pattern.compile(".*</contributor>.*");
    private static final Pattern USERNAME_PATTERN = Pattern.compile(".*<username>(.+)</username>.*");
    private static final Pattern COMMENT_PATTERN = Pattern.compile(".*<comment>(.+)</comment>.*");
    private static final Pattern IS_MINOR_PATTERN = Pattern.compile(".*<minor />.*");
    private static final Pattern PAGE_END_PATTERN = Pattern.compile(".*</page>.*");

    // The text patterns are compiled with DOTALL because BufferedReader.readLine() only splits
    // on \n, \r and \r\n, whereas '.' by default also refuses U+0085, U+2028 and U+2029. Without
    // DOTALL a text line containing one of those characters could never match.
    //
    // Opening tag: any non-self-closing <text ...> tag, with or without attributes. The captured
    // remainder may be empty (text that starts with a line break).
    private static final Pattern TEXT_BEGIN_PATTERN =
            Pattern.compile(".*<text(?:\\s[^>]*)?(?<!/)>(.*)", Pattern.DOTALL);
    // Closing tag: the captured prefix may be empty (text that ends with a line break).
    private static final Pattern TEXT_END_PATTERN = Pattern.compile("(.*)</text>.*", Pattern.DOTALL);

    private WikiRevisionWritable revision;
    private WikiPageWritable currentPage;
    private boolean foundRevisionId, foundContributor, foundComment, foundText, foundTimestamp, foundMinor,
            foundPageId, foundPageTitle, foundPageRedirect;

    /**
     * Creates a parser that is not yet inside any page.
     */
    public Parser() {
        currentPage = null;
    }

    /**
     * Clears the per-revision markers so the next revision can be filled from scratch.
     */
    private void resetMarkers() {
        foundRevisionId = false;
        foundContributor = false;
        foundComment = false;
        foundText = false;
        foundTimestamp = false;
        foundMinor = false;
    }

    /**
     * Clears the per-page markers so the next page header can be filled from scratch.
     */
    private void resetPageMarkers() {
        foundPageId = false;
        foundPageTitle = false;
        foundPageRedirect = false;
    }

    /**
     * Reads lines from {@code in} until a closing revision tag or the end of the input is
     * reached and returns the revision assembled on the way.
     *
     * <p>Lines before the first {@code <page>} line are skipped. When a {@code <page>} line is
     * met, the page header is read and becomes the page of the returned revision.
     *
     * @param in reader positioned somewhere in the XML; must not be {@code null}
     * @return the revision object built during this call; if the input ends before any revision
     *         data is seen, the object carries only what was set up to that point
     * @throws IOException if reading from {@code in} fails
     */
    public WikiRevisionWritable readNextRevision(BufferedReader in) throws IOException {
        resetMarkers();

        revision = new WikiRevisionWritable(currentPage);

        String line;
        // readLine() returning null is the only reliable end-of-input signal; ready() merely
        // says whether a read would block.
        while ((line = in.readLine()) != null) {
            boolean isPageStart = line.trim().startsWith(PAGE_START);

            if (currentPage == null && !isPageStart) {
                continue;
            }

            if (isPageStart) {
                readNextPage(in);
                revision.setPage(currentPage);
            }

            if (readRevisionId(line) || readTimeStamp(line) || readContributor(line, in) || readComment(line)
                    || readMinor(line) || readText(line, in)) {
                continue;
            }

            if (REVISION_END_PATTERN.matcher(line).matches()) {
                break;
            }
        }

        return revision;
    }

    /**
     * Stores the first id line seen in the current revision as the revision id.
     *
     * @return {@code true} if {@code line} was consumed as the revision id
     */
    private boolean readRevisionId(String line) {
        if (foundRevisionId) {
            return false;
        }
        long id = matchId(line);
        if (id == -1) {
            return false;
        }
        revision.setId(id);
        foundRevisionId = true;
        return true;
    }

    /**
     * Stores the first timestamp line seen in the current revision.
     *
     * @return {@code true} if {@code line} was consumed as the timestamp
     */
    private boolean readTimeStamp(String line) {
        if (foundTimestamp) {
            return false;
        }
        long timestamp = matchTimestamp(line);
        if (timestamp == -1) {
            return false;
        }
        revision.setTimestamp(timestamp);
        foundTimestamp = true;
        return true;
    }

    /**
     * If {@code line} opens a contributor element, reads the following lines up to the closing
     * contributor tag (or the end of the input), picking up the first username and the first id
     * found, and attaches the resulting contributor to the current revision.
     *
     * @return {@code true} if a contributor element was consumed
     * @throws IOException if reading from {@code in} fails
     */
    private boolean readContributor(String line, BufferedReader in) throws IOException {
        if (foundContributor || !matchContributor(line)) {
            return false;
        }
        foundContributor = true;

        WikiRevisionContributor contributor = new WikiRevisionContributor();
        boolean foundUsername = false;
        boolean foundContributorId = false;

        String inner;
        while ((inner = in.readLine()) != null) {
            if (!foundUsername) {
                String username = matchUsername(inner);
                if (username != null) {
                    contributor.setUsername(username);
                    foundUsername = true;
                    continue;
                }
            }

            if (!foundContributorId) {
                long id = matchId(inner);
                if (id != -1) {
                    contributor.setId(id);
                    foundContributorId = true;
                    continue;
                }
            }

            if (matchContributorEnd(inner)) {
                break;
            }
        }

        revision.setContributor(contributor);
        return true;
    }

    /**
     * Stores the first single-line comment seen in the current revision.
     *
     * @return {@code true} if {@code line} was consumed as the comment
     */
    private boolean readComment(String line) {
        if (foundComment) {
            return false;
        }
        String comment = matchComment(line);
        if (comment == null) {
            return false;
        }
        revision.setComment(comment);
        foundComment = true;
        return true;
    }

    /**
     * Marks the current revision as minor when {@code line} contains the minor flag element.
     *
     * @return {@code true} if {@code line} was consumed as the minor flag
     */
    private boolean readMinor(String line) {
        if (foundMinor || !matchMinor(line)) {
            return false;
        }
        revision.setMinor(true);
        foundMinor = true;
        return true;
    }

    /**
     * If {@code line} opens the text element, collects the text up to the closing text tag
     * (or the end of the input), decodes XML entities in every segment and stores the result
     * in the current revision. Lines are joined with {@code '\n'}.
     *
     * @return {@code true} if a text element was consumed
     * @throws IOException if reading from {@code in} fails
     */
    private boolean readText(String line, BufferedReader in) throws IOException {
        if (foundText) {
            return false;
        }
        String current = matchTextBegin(line);
        if (current == null) {
            return false;
        }

        StringBuilder text = new StringBuilder();
        while (current != null) {
            Matcher endMatcher = TEXT_END_PATTERN.matcher(current);
            if (endMatcher.matches()) {
                // The closing segment needs the same entity decoding as every other line.
                text.append(StringEscapeUtils.unescapeXml(endMatcher.group(1)));
                break;
            }
            text.append(StringEscapeUtils.unescapeXml(current)).append(NEWLINE);
            current = in.readLine();
        }

        foundText = true;
        revision.setText(text.toString());
        return true;
    }

    /**
     * Starts a new current page and reads its header lines (title, id, redirect flag) until
     * the first revision opens, the page closes, or the input ends.
     *
     * @throws IOException if reading from {@code in} fails
     */
    private void readNextPage(BufferedReader in) throws IOException {
        resetPageMarkers();

        currentPage = new WikiPageWritable();

        String line;
        while ((line = in.readLine()) != null) {
            if (readPageTitle(line) || readPageId(line) || readPageRedirect(line)) {
                continue;
            }

            // The page header ends where the first revision begins.
            if (matchRevision(line)) {
                break;
            }

            if (PAGE_END_PATTERN.matcher(line).matches()) {
                break;
            }
        }
    }

    /**
     * Stores the first title line seen in the current page.
     *
     * @return {@code true} if {@code line} was consumed as the page title
     */
    private boolean readPageTitle(String line) {
        if (foundPageTitle) {
            return false;
        }
        String title = matchTitle(line);
        if (title == null) {
            return false;
        }
        currentPage.setTitle(title);
        foundPageTitle = true;
        return true;
    }

    /**
     * Stores the first id line seen in the current page header as the page id.
     *
     * @return {@code true} if {@code line} was consumed as the page id
     */
    private boolean readPageId(String line) {
        if (foundPageId) {
            return false;
        }
        long id = matchId(line);
        if (id == -1) {
            return false;
        }
        currentPage.setId(id);
        foundPageId = true;
        return true;
    }

    /**
     * Flags the current page as a redirect when {@code line} contains a redirect element.
     *
     * @return {@code true} if {@code line} was consumed as the redirect marker
     */
    private boolean readPageRedirect(String line) {
        if (foundPageRedirect || !matchRedirect(line)) {
            return false;
        }
        currentPage.setRedirect(true);
        foundPageRedirect = true;
        return true;
    }

    /**
     * @return the content of the title element on {@code line}, or {@code null} if there is none
     */
    private static String matchTitle(String line) {
        Matcher titleMatcher = TITLE_PATTERN.matcher(line);
        return titleMatcher.matches() ? titleMatcher.group(1) : null;
    }

    /**
     * @return the numeric content of the id element on {@code line}, or {@code -1} if there is none
     */
    private static long matchId(String line) {
        Matcher idMatcher = ID_PATTERN.matcher(line);
        return idMatcher.matches() ? Long.parseLong(idMatcher.group(1)) : -1;
    }

    /**
     * @return {@code true} if {@code line} contains a self-closing redirect element
     */
    private static boolean matchRedirect(String line) {
        return REDIRECT_PATTERN.matcher(line).matches();
    }

    /**
     * Tells whether a line opens a revision element.
     *
     * @param line a single line of the XML input
     * @return {@code true} if {@code line} contains {@code <revision>}
     */
    public static boolean matchRevision(String line) {
        return REVISION_PATTERN.matcher(line).matches();
    }

    /**
     * @return the timestamp on {@code line} converted to epoch milliseconds, or {@code -1} if
     *         the line holds no timestamp element
     */
    private static long matchTimestamp(String line) {
        Matcher timestampMatcher = TIMESTAMP_PATTERN.matcher(line);
        if (!timestampMatcher.matches()) {
            return -1;
        }
        String timeString = timestampMatcher.group(1);
        return DatatypeConverter.parseDateTime(timeString).getTimeInMillis();
    }

    /**
     * @return {@code true} if {@code line} opens a contributor element
     */
    private static boolean matchContributor(String line) {
        return CONTRIBUTOR_PATTERN.matcher(line).matches();
    }

    /**
     * @return the content of the username element on {@code line}, or {@code null} if there is none
     */
    private static String matchUsername(String line) {
        Matcher userNameMatcher = USERNAME_PATTERN.matcher(line);
        return userNameMatcher.matches() ? userNameMatcher.group(1) : null;
    }

    /**
     * @return {@code true} if {@code line} closes a contributor element
     */
    private static boolean matchContributorEnd(String line) {
        return CONTRIBUTOR_END_PATTERN.matcher(line).matches();
    }

    /**
     * @return the content of a comment element that opens and closes on {@code line},
     *         or {@code null} if there is none
     */
    private static String matchComment(String line) {
        Matcher commentMatcher = COMMENT_PATTERN.matcher(line);
        return commentMatcher.matches() ? commentMatcher.group(1) : null;
    }

    /**
     * @return everything following the opening text tag on {@code line} (possibly empty),
     *         or {@code null} if the line does not open a text element
     */
    private static String matchTextBegin(String line) {
        Matcher textBeginMatcher = TEXT_BEGIN_PATTERN.matcher(line);
        return textBeginMatcher.matches() ? textBeginMatcher.group(1) : null;
    }

    /**
     * @return {@code true} if {@code line} contains the minor flag element
     */
    private static boolean matchMinor(String line) {
        return IS_MINOR_PATTERN.matcher(line).matches();
    }
}