de.ist.clonto.webwiki.InfoboxParser.java Source code

Java tutorial

Introduction

Here is the source code for de.ist.clonto.webwiki.InfoboxParser.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package de.ist.clonto.webwiki;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.validator.routines.UrlValidator;

import de.ist.clonto.webwiki.model.Information;
import de.ist.clonto.webwiki.model.Property;

/**
 * Extracts infobox templates (<code>{{Infobox ...}}</code>) from wiki markup
 * and converts each of them into an {@link Information} holding one
 * {@link Property} per attribute value.
 *
 * <p>
 * The idea is based on
 * https://code.google.com/p/gwtwiki/source/browse/trunk/info.bliki.wiki/bliki-core/src/main/java/info/bliki/wiki/dump/WikiPatternMatcher.java?r=1349
 * It was modified here because an article can contain several infoboxes
 * (sample article: 'XML'), whereas the referenced matcher retrieves only one
 * infobox per article. A test method is provided in MyCrawlerTest.
 *
 * @author Marcel
 */
public class InfoboxParser {

    /** Start of an infobox template: "{{", optional whitespace, "infobox" in any case. */
    private static final Pattern INFOBOX_START = Pattern.compile("\\{\\{\\s*infobox",
            Pattern.CASE_INSENSITIVE);

    /** One attribute: text without a pipe, an '=', then the rest of the line. */
    private static final Pattern ATTRIBUTE = Pattern.compile("[^|]*=.*");

    /** Internal wiki link such as [[Target|label]]. */
    private static final Pattern WIKI_ANCHOR = Pattern.compile("\\[\\[.*?\\]\\]");

    /** Single-bracket link such as [http://example.org label]. */
    private static final Pattern EXTERNAL_ANCHOR = Pattern.compile("\\[.*?\\]");

    /** Separator used to isolate the first token of a single-bracket link. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s");

    /** Self-closing reference tag, e.g. &lt;ref name="a" /&gt;; cannot run past the tag's own '&gt;'. */
    private static final Pattern SELF_CLOSING_REF = Pattern.compile("<ref\\b[^>]*?/>",
            Pattern.CASE_INSENSITIVE);

    /** Reference with content, e.g. &lt;ref&gt;...&lt;/ref&gt; or &lt;ref name="a"&gt;...&lt;/ref&gt;. */
    private static final Pattern PAIRED_REF = Pattern.compile("<ref\\b[^>]*>.*?</ref>",
            Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** HTML comment, possibly spanning several lines. */
    private static final Pattern HTML_COMMENT = Pattern.compile("<!--.*?-->", Pattern.DOTALL);

    /**
     * Finds every infobox template in the given wiki markup and parses it.
     *
     * <p>
     * HTML comments are removed before searching. The search resumes right
     * after each "{{infobox" token, so an infobox nested inside another one is
     * returned as its own entry in addition to being part of the text of the
     * enclosing infobox.
     *
     * @param text wiki markup of a page; {@code null} is treated as empty
     * @return the parsed infoboxes in order of appearance, never {@code null}
     */
    public List<Information> parse(String text) {
        List<Information> infoboxes = new ArrayList<Information>();
        if (text == null) {
            return infoboxes;
        }

        String pagetext = replaceHTMLComments(text);
        Matcher matcher = INFOBOX_START.matcher(pagetext);
        while (matcher.find()) {
            int begin = matcher.start();
            int end = findTemplateEnd(pagetext, matcher.end());
            infoboxes.add(parseSet(pagetext.substring(begin, end)));
        }

        return infoboxes;
    }

    /**
     * Locates the end (exclusive) of the template whose opening "{{" has
     * already been consumed, by counting single braces starting at depth 2.
     *
     * @param pagetext the text being scanned
     * @param from index of the first character after the "{{infobox" token
     * @return index just before the closing braces of the template, or the
     *         length of {@code pagetext} if the braces never balance
     */
    private static int findTemplateEnd(String pagetext, int from) {
        int depth = 2;
        for (int pos = from; pos < pagetext.length(); pos++) {
            char c = pagetext.charAt(pos);
            if (c == '{') {
                depth++;
            } else if (c == '}') {
                depth--;
                if (depth == 0) {
                    // pos is the last closing brace. When the brace before it
                    // is the other half of the closing "}}", exclude it too so
                    // the last attribute value does not end in a stray '}'.
                    return pagetext.charAt(pos - 1) == '}' ? pos - 1 : pos;
                }
            }
        }
        return pagetext.length();
    }

    /**
     * Parses the text of a single infobox into an {@link Information}.
     *
     * <p>
     * Links and references are filtered out first. The infobox name is taken
     * from the headline. Every "name = value" attribute is split at its first
     * '=' so that values containing '=' (for example URLs with a query string)
     * stay intact. The value is then split at commas and one {@link Property}
     * is added per piece. Attributes with an empty value are skipped.
     *
     * @param infoboxtext markup of one infobox, starting at its "{{"; must not
     *            be {@code null}
     * @return the information with its name and properties set
     */
    public Information parseSet(String infoboxtext) {
        Information information = new Information();

        String text = filterInfoboxMarkup(infoboxtext);
        information.setName(retrieveInfoboxName(text));

        Matcher matcher = ATTRIBUTE.matcher(text);
        while (matcher.find()) {
            String match = matcher.group().trim();

            // The pattern guarantees an '=', and trim() cannot remove it.
            int separator = match.indexOf('=');
            String name = match.substring(0, separator).trim();
            String value = match.substring(separator + 1).trim();

            if (value.isEmpty()) {
                continue;
            }

            for (String val : value.split(",")) {
                Property at = new Property();
                at.setName(name);
                at.setValue(val.trim());
                information.addProperty(at);
            }
        }

        return information;
    }

    /**
     * Applies all markup filters in order: internal links, single-bracket
     * links, then references.
     */
    private String filterInfoboxMarkup(String text) {
        text = replaceWikiAnchorsWithNames(text);
        text = removeExternalAnchors(text);
        text = removeReferences(text);
        return text;
    }

    /**
     * Replaces every internal link with its first part, i.e. the text before
     * the first pipe: [[Target|label]] becomes Target. A link without any
     * text before the pipe is replaced by the empty string.
     */
    private String replaceWikiAnchorsWithNames(String text) {
        Matcher anchorMatcher = WIKI_ANCHOR.matcher(text);
        StringBuffer result = new StringBuffer();

        while (anchorMatcher.find()) {
            String inner = stripSquareBrackets(anchorMatcher.group());
            int pipe = inner.indexOf('|');
            String target = pipe < 0 ? inner : inner.substring(0, pipe);
            // quoteReplacement: the target may contain '$' or '\'.
            anchorMatcher.appendReplacement(result, Matcher.quoteReplacement(target));
        }
        anchorMatcher.appendTail(result);

        return result.toString();
    }

    /**
     * Removes the square brackets around single-bracket links whose first
     * whitespace-separated token is accepted by a default-constructed
     * {@link UrlValidator}. Other bracketed text is left untouched.
     */
    private String removeExternalAnchors(String text) {
        UrlValidator validator = new UrlValidator();
        Matcher eanchorMatcher = EXTERNAL_ANCHOR.matcher(text);
        StringBuffer result = new StringBuffer();

        while (eanchorMatcher.find()) {
            String inner = stripSquareBrackets(eanchorMatcher.group());
            // split() drops trailing empty strings, so whitespace-only content
            // such as "[ ]" yields an empty array.
            String[] tokens = WHITESPACE.split(inner);
            if (tokens.length > 0 && validator.isValid(tokens[0])) {
                eanchorMatcher.appendReplacement(result, Matcher.quoteReplacement(inner));
            }
        }
        eanchorMatcher.appendTail(result);

        return result.toString();
    }

    /** Returns the given text without any '[' or ']' characters. */
    private static String stripSquareBrackets(String anchor) {
        return anchor.replace("[", "").replace("]", "");
    }

    /**
     * Removes reference tags together with their content. Self-closing tags
     * are removed first so that they are not mistaken for the opening tag of
     * a later reference, which would delete the text in between.
     */
    private String removeReferences(String text) {
        text = SELF_CLOSING_REF.matcher(text).replaceAll("");
        text = PAIRED_REF.matcher(text).replaceAll("");

        return text;
    }

    /** Removes all HTML comments, including multi-line ones. */
    private String replaceHTMLComments(String text) {
        return HTML_COMMENT.matcher(text).replaceAll("");
    }

    /**
     * Extracts the lower-cased infobox name from the headline of an infobox.
     *
     * <p>
     * Leading braces and whitespace are skipped. The headline then ends at
     * the first line break or pipe, whichever comes first (demo case:
     * infobox vg|title="something"). Braces and the word "infobox" are
     * removed from it.
     *
     * @param text filtered infobox markup
     * @return the trimmed, lower-cased name; may be empty
     */
    private String retrieveInfoboxName(String text) {
        int start = 0;
        while (start < text.length()
                && (text.charAt(start) == '{' || Character.isWhitespace(text.charAt(start)))) {
            start++;
        }

        int end = start;
        while (end < text.length() && text.charAt(end) != '\n' && text.charAt(end) != '|') {
            end++;
        }

        String headline = text.substring(start, end);
        headline = headline.replace("{", "").replace("}", "");
        // Locale.ROOT keeps the "infobox" removal independent of the default
        // locale (e.g. Turkish dotless i).
        headline = headline.toLowerCase(Locale.ROOT);
        headline = headline.replace("infobox", "");
        return headline.trim();
    }

}