Source code

Java tutorial


Here is the source code for


package it.unimi.di.big.mg4j.document;

/*
 * MG4J: Managing Gigabytes for Java (big)
 *
 * Copyright (C) 2013-2015 Sebastiano Vigna
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

import it.unimi.di.big.mg4j.tool.URLMPHVirtualDocumentResolver;
import it.unimi.di.big.mg4j.tool.VirtualDocumentResolver;
import it.unimi.di.big.mg4j.util.parser.callback.AnchorExtractor;
import it.unimi.di.big.mg4j.util.parser.callback.AnchorExtractor.Anchor;
import it.unimi.dsi.big.util.ShiftAddXorSignedStringMap;
import it.unimi.dsi.big.util.StringMap;
import it.unimi.dsi.big.webgraph.ImmutableGraph;
import it.unimi.dsi.bits.TransformationStrategies;
import it.unimi.dsi.bits.TransformationStrategy;
import it.unimi.dsi.fastutil.objects.AbstractObject2LongFunction;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import it.unimi.dsi.fastutil.objects.Object2LongFunction;
import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap;
import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.fastutil.objects.ObjectBigList;
import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import it.unimi.dsi.fastutil.objects.Reference2ObjectOpenHashMap;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.lang.ObjectParser;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.sux4j.mph.MWHCFunction;
import it.unimi.dsi.util.TextPattern;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.ArrayBlockingQueue;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;

/** A class exhibiting a standard Wikipedia XML dump as a {@link DocumentSequence}. 
 * <P><strong>Warning</strong>: this class has no connection whatsoever with
 * {@link WikipediaDocumentCollection}.
 * <p>The purpose of this class is making the indexing of Wikipedia and of its entity
 * graph starting from a pristine Wikipedia XML dump reasonably easy. There are a few
 * steps involved, mainly due to the necessity of working out redirects, but the whole
 * procedure can be carried out with very little resources. The class uses the
 * {@link WikiModel#toHtml(String, Appendable, String, String)} method to convert
 * the Wikipedia format into HTML, and then passes the result to a standard {@link HtmlDocumentFactory}
 * (suggestions on alternative conversion methods are welcome).
 * A few additional fields are handled by {@link WikipediaHeaderFactory}.
 * <p>Note that no properties are passed to the underlying {@link HtmlDocumentFactory}: if you want
 * to set the anchor properties (see {@link HtmlDocumentFactory.MetadataKeys}), you need to use
 * {@linkplain #WikipediaDocumentSequence(String, boolean, String, boolean, boolean, boolean, int, int, int) a quite humongous constructor}.
 * <h3>How to index Wikipedia</h3>
 * <p>As a first step, download the Wikipedia XML dump (it's the &ldquo;pages-articles&rdquo; file;
 * it should start with a <code>mediawiki</code> opening tag). This class can process the
 * file in its compressed form, but we suggest uncompressing it first using <code>bunzip2</code>,
 * as processing the uncompressed file is an order of magnitude faster. (Note that the following process will exclude namespaced
 * pages such as <code>Template:<var>something</var></code> and all templates, such as infoboxes, etc.; if you want to include them, you must 
 * use a different {@linkplain #WikipediaDocumentSequence(String, boolean, String, boolean, boolean, boolean) constructor}.)
 * <p>The first step is extracting metadata (in particular, the URLs that are necessary to
 * index correctly the anchor text). We do not suggest specific Java options, but try to use
 * as much memory as you can.
 * <pre>
 * java it.unimi.di.big.mg4j.tool.ScanMetadata \
 *   -o "it.unimi.di.big.mg4j.document.WikipediaDocumentSequence(enwiki-latest-pages-articles.xml,false,,false)" \
 *   -u enwiki.uris -t enwiki.titles
 * </pre>
 * <p>Note that we used the {@link ObjectParser}-based constructor of this class, which makes it possible to create
 * a {@link WikipediaDocumentSequence} instance parsing a textual specification (see the 
 * {@linkplain #WikipediaDocumentSequence(String, boolean, String, boolean) constructor}
 * documentation for details about the parameters).
 * <p>The second step consists in building a first {@link VirtualDocumentResolver} which, however, 
 * does not comprise redirect information:
 * <pre>
 * java it.unimi.di.big.mg4j.tool.URLMPHVirtualDocumentResolver -o enwiki.uris enwiki.vdr
 * </pre>
 * <p>Now we need to use the <i>ad hoc</i> main method of this class to rescan the collection, gather the redirect
 * information and merge it with our current resolver:
 * <pre>
 * java it.unimi.di.big.mg4j.document.WikipediaDocumentSequence \
 *   enwiki-latest-pages-articles.xml enwiki.uris enwiki.vdr enwikired.vdr
 * </pre>
 * <p>During this phase a fairly large number of warnings about <em>failed redirects</em> might appear. This is normal,
 * in particular if you do not index template pages. If you suspect an actual bug, first try indexing template pages,
 * too. Failed redirects should be on the order of a few thousand, and should all be due to internal inconsistencies of
 * the dump: to verify that this is the case, check whether the target of a failed redirect appears as a page
 * title (it shouldn't).
 * <p>We have now all information required to build a complete index (we use the Porter2 stemmer in this example):
 * <pre>
 * java it.unimi.di.big.mg4j.tool.IndexBuilder \
 *   -o "it.unimi.di.big.mg4j.document.WikipediaDocumentSequence(enwiki.xml,false,,true)" \ 
 *   --all-fields -v enwikired.vdr -t EnglishStemmer enwiki
 * </pre>
 * <p>Finally, we can build the entity graph using a bridge class that exposes any {@link DocumentSequence} with a virtual
 * field as an {@link ImmutableGraph} of the <a href="http://webgraph.di.unimi.it/">WebGraph framework</a> (the nodes will be in one-to-one correspondence with the documents
 * returned by the index):
 * <pre>
 * java it.unimi.dsi.big.webgraph.BVGraph \
 *   -s "it.unimi.di.big.mg4j.util.DocumentSequenceImmutableSequentialGraph(\"it.unimi.di.big.mg4j.document.WikipediaDocumentSequence(enwiki.xml,false,,true)\",anchor,enwikired.vdr)" \ 
 *   enwiki
 * </pre>
 * <h2>Additional fields</h2>
 * <p>The additional fields generated by this class (some of which are a bit hacky) are:
 * <dl>
 * <dt><code>title</code>
 * <dd>the title of the Wikipedia page;
 * <dt><code>id</code>
 * <dd>a payload index containing the Wikipedia identifier of the page;
 * <dt><code>lastedit</code>
 * <dd>a payload index containing the last edit of the page;
 * <dt><code>category</code>
 * <dd>a field containing the categories of the page, separated by an artificial marker <code>OXOXO</code> (so when you look for a category as a phrase you
 * don't get false cross-category positives);
 * <dt><code>firstpar</code>
 * <dd>a heuristically generated first paragraph of the page, useful for identification beyond the title;
 * <dt><code>redirect</code>
 * <dd>a virtual field treating the link of the page with its title and any redirect link to the page as an anchor: in practice, the
 * field contains all names under which the page is known in Wikipedia.
 * </dl>
 * <p>Note that for each link in a disambiguation page this class will generate a fake link with the same target, but
 * the title of the disambiguation page as text. This is in the same spirit as the <code>redirect</code> field&mdash;we enrich
 * the HTML <code>anchor</code> field with useful information without altering the generated graph.
 */

public class WikipediaDocumentSequence extends AbstractDocumentSequence implements Serializable {
    private static final Logger LOGGER = LoggerFactory.getLogger(WikipediaDocumentSequence.class);
    private static final long serialVersionUID = 1L;

    /** The literal text that opens a category link in raw wikitext. */
    private static final TextPattern CATEGORY_START = new TextPattern("[[Category:");
    /** The literal text that closes a wikitext link. */
    private static final TextPattern BRACKETS_CLOSED = new TextPattern("]]");
    /** The literal text that closes a wikitext template call. */
    private static final TextPattern BRACES_CLOSED = new TextPattern("}}");
    /** The literal text that opens a disambiguation template call. */
    private static final TextPattern DISAMBIGUATION = new TextPattern("{{disambiguation");
    /** The literal text that opens a wikitext link. */
    private static final TextPattern BRACKETS_OPEN = new TextPattern("[[");
    /** The characters that end the target part of a link when scanning disambiguation pages. */
    private static final char[] END_OF_DISAMBIGUATION_LINK = new char[] { '|', ']' };

    /** A marker used to denote end of input. */
    private static final DocumentAndFactory END = new DocumentAndFactory(null, null);
    /** The prototype {@link CompositeDocumentFactory} used to parse Wikipedia pages. */
    private final DocumentFactory factory;
    /** Whether the input is compressed with <code>bzip2</code>. */
    private final boolean bzipped;
    /** Whether to parse text (e.g., we do not parse text when computing titles/URIs). */
    private final boolean parseText;
    /** Whether to keep namespaced pages in the index. */
    private final boolean keepNamespaced;
    /** The Wikipedia XML dump. */
    private final String wikipediaXmlDump;
    /** The base URL for pages (e.g., <code>http://en.wikipedia.org/wiki/</code> for the English Wikipedia). */
    private final String baseURL;
    /** {@link #baseURL} concatenated with <code>${title}</code>. */
    private final String linkBaseURL;
    /** {@link #baseURL} concatenated with <code>${image}</code>. */
    private final String imageBaseURL;
    /** The set of namespaces specified in {@link #wikipediaXmlDump}. */
    private ImmutableSet<MutableString> nameSpaces;
    /** This list (whose access must be synchronized) accumulates virtual text (anchors) generated by redirects.
     * It is filled when meeting redirect pages, and it is emptied at the first non-redirect page (the page in which the list
     * is emptied is immaterial). Note that because of this setup, if there are some redirect 
     * pages that are not followed by any indexed page the anchors of those redirects won't be processed at all. 
     * If this is a problem, just add a fake empty page at the end. */
    private final ObjectArrayList<Anchor> redirectAnchors = new ObjectArrayList<Anchor>();
    /** The Bliki model used to parse pages. */
    private final MyWikiModel wikiModel;

    /** Keys of the Wikipedia-specific metadata read by {@link WikipediaHeaderFactory}. */
    public enum MetadataKeys {
        /** The Wikipedia identifier of the page. */
        ID,
        /** The date of the last edit of the page. */
        LASTEDIT,
        /** The categories of the page. */
        CATEGORY,
        /** The heuristically extracted first paragraph of the page. */
        FIRSTPAR,
        /** This key is used internally by {@link WikipediaHeaderFactory} and is associated with the list of redirect anchors. */
        REDIRECT
    }

    /** A factory responsible for special Wikipedia fields (see the {@linkplain WikipediaDocumentSequence class documentation}). It
     * will be {@linkplain CompositeDocumentFactory composed} with an {@link HtmlDocumentFactory}. */
    // NOTE(review): in this listing the closing braces of this class and of its members are missing,
    // as are the `default:` labels in front of the trailing `throw` statements.
    public static final class WikipediaHeaderFactory extends AbstractDocumentFactory {
        private static final long serialVersionUID = 1L;
        /** Maps each field name to its index; the inverse of {@link #fieldName(int)}. */
        private static final Object2IntOpenHashMap<String> FIELD_2_INDEX = new Object2IntOpenHashMap<String>(
                new String[] { "title", "id", "lastedit", "category", "firstpar", "redirect" },
                new int[] { 0, 1, 2, 3, 4, 5 });
        // NOTE(review): the body of this static initializer is not visible here.
        static {

        /** The word reader handed out for every field of the documents created by this factory. */
        private final WordReader wordReader = new FastBufferedReader();

        /** Returns the number of fields handled by this factory, which is six. */
        public int numberOfFields() {
            return 6;

        /** Returns the name of the field with the given index (0 to 5).
         * @param field a field index.
         * @return the name of the field.
         */
        public String fieldName(int field) {
            switch (field) {
            case 0:
                return "title";
            case 1:
                return "id";
            case 2:
                return "lastedit";
            case 3:
                return "category";
            case 4:
                return "firstpar";
            case 5:
                return "redirect";
                // Presumably the default branch: any other index is rejected.
                throw new IllegalArgumentException();

        /** Returns the index of the field with the given name, as recorded in {@link #FIELD_2_INDEX}.
         * @param fieldName a field name.
         * @return the index of the field; for unknown names, the default return value of the map.
         */
        public int fieldIndex(String fieldName) {
            return FIELD_2_INDEX.getInt(fieldName);

        /** Returns the type of the field with the given index (0 to 5).
         * @param field a field index.
         * @return the type of the field.
         */
        public FieldType fieldType(int field) {
            switch (field) {
            case 0:
                return FieldType.TEXT;
            case 1:
                return FieldType.INT;
            case 2:
                return FieldType.DATE;
            case 3:
                return FieldType.TEXT;
            case 4:
                return FieldType.TEXT;
            case 5:
                return FieldType.VIRTUAL;
                // Presumably the default branch: any other index is rejected.
                throw new IllegalArgumentException();

        /** Returns a document whose fields are all read from the given metadata.
         * @param unusedRawContent the raw content; it is not referenced by the returned document.
         * @param metadata the metadata from which URI, title and field contents are fetched.
         * @return a document exposing the six Wikipedia-specific fields.
         */
        public Document getDocument(final InputStream unusedRawContent,
                final Reference2ObjectMap<Enum<?>, Object> metadata) throws IOException {
            return new AbstractDocument() {

                public WordReader wordReader(int field) {
                    return wordReader; // Fixed, for the time being.

                public CharSequence uri() {
                    return (CharSequence) metadata.get(PropertyBasedDocumentFactory.MetadataKeys.URI);

                public CharSequence title() {
                    return (CharSequence) metadata.get(PropertyBasedDocumentFactory.MetadataKeys.TITLE);

                public Object content(final int field) throws IOException {
                    switch (field) {
                    case 0:
                        // Title: a reader over the title string.
                        return new FastBufferedReader(
                                (MutableString) metadata.get(PropertyBasedDocumentFactory.MetadataKeys.TITLE));
                    case 1:
                        // Identifier: the stored object is returned as is.
                        return metadata.get(MetadataKeys.ID);
                    case 2:
                        // Last edit: the stored object is returned as is.
                        return metadata.get(MetadataKeys.LASTEDIT);
                    case 3:
                        // Categories: a reader over the category string.
                        return new FastBufferedReader((MutableString) metadata.get(MetadataKeys.CATEGORY));
                    case 4:
                        // First paragraph: a reader over the paragraph string.
                        return new FastBufferedReader((MutableString) metadata.get(MetadataKeys.FIRSTPAR));
                    case 5:
                        // NOTE(review): the next statement is truncated in this listing; presumably it
                        // fetches the list stored under MetadataKeys.REDIRECT.
                        final ObjectArrayList<Anchor> redirectAnchors = (ObjectArrayList<Anchor>) metadata
                        ImmutableList<Anchor> result;

                        // Under the lock of the list, add an anchor made of this page's URI and title,
                        // then take an immutable snapshot to return.
                        synchronized (redirectAnchors) {
                            redirectAnchors.add(new Anchor(
                                    (MutableString) metadata.get(PropertyBasedDocumentFactory.MetadataKeys.URI),
                                    (MutableString) metadata.get(PropertyBasedDocumentFactory.MetadataKeys.TITLE)));
                            result = ImmutableList.copyOf(redirectAnchors);
                        return result;
                        // Presumably the default branch: any other index is rejected.
                        throw new IllegalArgumentException();

        /** Returns a new, independent factory of this class. */
        public DocumentFactory copy() {
            return new WikipediaHeaderFactory();

    /** An extension of {@link WikiModel} that can optionally skip templates. */
    // NOTE(review): the closing braces of this class and of its members are missing in this listing.
    private static class MyWikiModel extends WikiModel {
        /** Whether template calls are passed on to the superclass for substitution. */
        private final boolean keepTemplates;

        /** Creates a new model.
         * @param imageBaseURL passed unchanged to the superclass constructor.
         * @param linkBaseURL passed unchanged to the superclass constructor.
         * @param keepTemplates whether template calls should be substituted.
         */
        public MyWikiModel(String imageBaseURL, String linkBaseURL, final boolean keepTemplates) {
            super(imageBaseURL, linkBaseURL);
            this.keepTemplates = keepTemplates;

        /** Delegates to the superclass if templates are kept; otherwise nothing is done in the code visible here. */
        public void substituteTemplateCall(String templateName, Map<String, String> parameterMap, Appendable writer)
                throws IOException {
            if (keepTemplates)
                super.substituteTemplateCall(templateName, parameterMap, writer);

    /** Builds a new Wikipedia document sequence using default anchor settings and discarding namespaced pages and templates.
     * @param file the file containing the Wikipedia dump.
     * @param bzipped whether {@code file} is compressed with <code>bzip2</code>.
     * @param baseURL a base URL for links (e.g., for the English Wikipedia, <code></code>);
     * note that if it is nonempty this string <strong>must</strong> terminate with a slash.
     * @param parseText whether to parse the text (this parameter is only set to false during metadata-scanning
     * phases to speed up the scanning process).
    public WikipediaDocumentSequence(final String file, final boolean bzipped, final String baseURL,
            final boolean parseText) {
        this(file, bzipped, baseURL, parseText, false, false);

    /** Builds a new Wikipedia document sequence using default anchor settings.
     * @param file the file containing the Wikipedia dump.
     * @param bzipped whether {@code file} is compressed with <code>bzip2</code>.
     * @param baseURL a base URL for links (e.g., for the English Wikipedia, <code></code>);
     * note that if it is nonempty this string <strong>must</strong> terminate with a slash.
     * @param parseText whether to parse the text (this parameter is only set to false during metadata-scanning
     * phases to speed up the scanning process).
     * @param keepNamespaced whether to keep namespaced pages (e.g., <code>Template:<var>something</var></code> pages).
    public WikipediaDocumentSequence(final String file, final boolean bzipped, final String baseURL,
            final boolean parseText, final boolean keepNamespaced, final boolean keepTemplates) {
        this(file, bzipped, baseURL, parseText, keepNamespaced, keepTemplates,
                HtmlDocumentFactory.DEFAULT_MAXPREANCHOR, HtmlDocumentFactory.DEFAULT_MAXANCHOR,

    /** Builds a new Wikipedia document sequence.
     * @param file the file containing the Wikipedia dump.
     * @param bzipped whether {@code file} is compressed with <code>bzip2</code>.
     * @param baseURL a base URL for links (e.g., for the English Wikipedia, <code></code>);
     * note that if it is nonempty this string <strong>must</strong> terminate with a slash.
     * @param parseText whether to parse the text (this parameter is only set to false during metadata-scanning
     * phases to speed up the scanning process).
     * @param keepNamespaced whether to keep namespaced pages (e.g., <code>Template:<var>something</var></code> pages).
     * @param keepTemplates whether to keep templates (e.g., infoboxes, taxoboxes, etc.); we suggest to pass false if you're building a Wikipedia graph.
     * @param maxPreAnchor maximum number of character before an anchor.
     * @param maxAnchor maximum number of character in an anchor.
     * @param maxPostAnchor maximum number of characters after an anchor.
    public WikipediaDocumentSequence(final String file, final boolean bzipped, final String baseURL,
            final boolean parseText, final boolean keepNamespaced, final boolean keepTemplates,
            final int maxPreAnchor, final int maxAnchor, final int maxPostAnchor) {
        this.wikipediaXmlDump = file;
        this.bzipped = bzipped;
        this.baseURL = baseURL;
        this.parseText = parseText;
        this.keepNamespaced = keepNamespaced;
        this.factory = new CompositeDocumentFactory(
                new DocumentFactory[] { new WikipediaHeaderFactory(),
                        new HtmlDocumentFactory(new Reference2ObjectOpenHashMap<Enum<?>, Object>(
                                new Enum[] { HtmlDocumentFactory.MetadataKeys.MAXPREANCHOR,
                                        HtmlDocumentFactory.MetadataKeys.MAXPOSTANCHOR },
                                new Integer[] { Integer.valueOf(maxPreAnchor), Integer.valueOf(maxAnchor),
                                        Integer.valueOf(maxPostAnchor) })) },
                new String[] { "title", "id", "lastedit", "category", "firstpar", "redirect", "text", "dummy",
                        "anchor" });
        linkBaseURL = baseURL + "${title}";
        imageBaseURL = baseURL + "${image}";
        wikiModel = new MyWikiModel(imageBaseURL, linkBaseURL, keepTemplates);

    /** A string-based constructor to be used with an {@link ObjectParser}.
     * @see #WikipediaDocumentSequence(String, boolean, String, boolean)
    public WikipediaDocumentSequence(final String file, final String bzipped, final String baseURL,
            final String parseText) {
        this(file, Boolean.parseBoolean(bzipped), baseURL, Boolean.parseBoolean(parseText));

    /** A string-based constructor to be used with an {@link ObjectParser}.
     * @see #WikipediaDocumentSequence(String, boolean, String, boolean, boolean, boolean)
    public WikipediaDocumentSequence(final String file, final String bzipped, final String baseURL,
            final String parseText, final String keepNamespaced, final String keepTemplates) {
        this(file, Boolean.parseBoolean(bzipped), baseURL, Boolean.parseBoolean(parseText),
                Boolean.parseBoolean(keepNamespaced), Boolean.parseBoolean(keepTemplates));

    /** A string-based constructor to be used with an {@link ObjectParser}.
     * @see #WikipediaDocumentSequence(String, boolean, String, boolean, boolean, boolean, int, int, int)
    public WikipediaDocumentSequence(final String file, final String bzipped, final String baseURL,
            final String parseText, final String keepNamespaced, final String keepTemplates,
            final String maxBeforeAnchor, final String maxAnchor, final String maxPostAnchor) {
        this(file, Boolean.parseBoolean(bzipped), baseURL, Boolean.parseBoolean(parseText),
                Boolean.parseBoolean(keepNamespaced), Boolean.parseBoolean(keepTemplates),
                Integer.parseInt(maxBeforeAnchor), Integer.parseInt(maxAnchor), Integer.parseInt(maxPostAnchor));

    private static final class DocumentAndFactory {
        public final Document document;
        public final DocumentFactory factory;

        public DocumentAndFactory(final Document document, final DocumentFactory documentFactory) {
            this.document = document;
            this.factory = documentFactory;

    public DocumentIterator iterator() throws IOException {
        final SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
        final MutableString nameSpaceAccumulator = new MutableString();
        final ObjectOpenHashSet<MutableString> nameSpacesAccumulator = new ObjectOpenHashSet<MutableString>();
        final ArrayBlockingQueue<DocumentFactory> freeFactories = new ArrayBlockingQueue<DocumentFactory>(16);
        for (int i = freeFactories.remainingCapacity(); i-- != 0;)
        final ArrayBlockingQueue<DocumentAndFactory> readyDocumentsAndFactories = new ArrayBlockingQueue<DocumentAndFactory>(

        final SAXParser parser;
        try {
            parser = saxParserFactory.newSAXParser();
        } catch (Exception e) {
            throw new RuntimeException(e.getMessage(), e);
        final DefaultHandler handler = new DefaultHandler() {
            private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
            private boolean inText;
            private boolean inTitle;
            private boolean inId;
            private boolean inTimestamp;
            private boolean inNamespaceDef;
            private boolean redirect;
            private MutableString text = new MutableString();
            private MutableString title = new MutableString();
            private MutableString id = new MutableString();
            private MutableString timestamp = new MutableString();
            private final Reference2ObjectMap<Enum<?>, Object> metadata = new Reference2ObjectOpenHashMap<Enum<?>, Object>();
                metadata.put(PropertyBasedDocumentFactory.MetadataKeys.ENCODING, "UTF-8");
                metadata.put(MetadataKeys.REDIRECT, redirectAnchors);

            public void startElement(String uri, String localName, String qName, Attributes attributes)
                    throws SAXException {
                if ("page".equals(localName)) {
                    redirect = inText = inTitle = inId = inTimestamp = false;
                } else if ("text".equals(localName))
                    inText = true;
                else if ("title".equals(localName) && title.length() == 0)
                    inTitle = true; // We catch only the first id/title elements.
                else if ("id".equals(localName) && id.length() == 0)
                    inId = true;
                else if ("timestamp".equals(localName) && timestamp.length() == 0)
                    inTimestamp = true;
                else if ("redirect".equals(localName)) {
                    redirect = true;
                    if (attributes.getValue("title") != null)
                        // Accumulate the title of the page as virtual text of the redirect page.
                        synchronized (redirectAnchors) {
                            final String link = Encoder.encodeTitleToUrl(attributes.getValue("title"), true);
                                    new AnchorExtractor.Anchor(new MutableString(baseURL.length() + link.length())
                                            .append(baseURL).append(link), title.copy()));
                } else if ("namespace".equals(localName)) {
                    // Found a new namespace
                    inNamespaceDef = true;

            public void endElement(String uri, String localName, String qName) throws SAXException {
                if ("namespace".equals(localName)) { // Collecting a namespace
                    if (nameSpaceAccumulator.length() != 0)

                if ("namespaces".equals(localName)) { // All namespaces collected
                    nameSpaces = ImmutableSet.copyOf(nameSpacesAccumulator);

                if (!redirect) {
                    if ("title".equals(localName)) {
                        // Set basic metadata for the page
                        metadata.put(PropertyBasedDocumentFactory.MetadataKeys.TITLE, title.copy());
                        String link = Encoder.encodeTitleToUrl(title.toString(), true);
                                new MutableString(baseURL.length() + link.length()).append(baseURL).append(link));
                        inTitle = false;
                    } else if ("id".equals(localName)) {
                        metadata.put(MetadataKeys.ID, Long.valueOf(id.toString()));
                        inId = false;
                    } else if ("timestamp".equals(localName)) {
                        try {
                            metadata.put(MetadataKeys.LASTEDIT, dateFormat.parse(timestamp.toString()));
                        } catch (ParseException e) {
                            throw new RuntimeException(e.getMessage(), e);
                        inTimestamp = false;
                    } else if ("text".equals(localName)) {
                        inText = false;
                        if (!keepNamespaced) {
                            // Namespaces are case-insensitive and language-dependent
                            final int pos = title.indexOf(':');
                            if (pos != -1 && nameSpaces.contains(title.substring(0, pos).toLowerCase()))
                        try {
                            final MutableString html = new MutableString();
                            DocumentFactory freeFactory;
                            try {
                                freeFactory = freeFactories.take();
                            } catch (InterruptedException e) {
                                throw new RuntimeException(e.getMessage(), e);
                            if (parseText) {
                                if ( != -1) { // It's a disambiguation page.
                                    /* Roi's hack: duplicate links using the page title, so the generic name will end up as anchor text. */
                                    final MutableString newLinks = new MutableString();
                                    for (int start = 0, end; (start =,
                                            start)) != -1; start = end) {
                                        end = start;
                                        final int endOfLink = text.indexOfAnyOf(END_OF_DISAMBIGUATION_LINK, start);
                                        // Note that we don't escape title because we are working at the Wikipedia raw text level.
                                        if (endOfLink != -1) {
                                            newLinks.append(text.array(), start, endOfLink - start).append('|')
                                            end = endOfLink;

                                // We separate categories by OXOXO, so we don't get overflowing phrases.
                                final MutableString category = new MutableString();
                                for (int start = 0, end; (start =,
                                        start)) != -1; start = end) {
                                    end =, start += CATEGORY_START.length());
                                    if (end != -1)
                                        category.append(text.subSequence(start, end)).append(" OXOXO ");
                                metadata.put(MetadataKeys.CATEGORY, category);

                                // Heuristics to get the first paragraph
                                metadata.put(MetadataKeys.FIRSTPAR, new MutableString());
                                String plainText = wikiModel.render(new PlainTextConverter(true), text.toString());
                                for (int start = 0; start < plainText.length(); start++) {
                                    //System.err.println("Examining " + plainText.charAt( start )  );
                                    if (Character.isWhitespace(plainText.charAt(start)))
                                    if (plainText.charAt(start) == '{') {
                                        //System.err.print( "Braces " + start + " text: \"" + plainText.subSequence( start, start + 10 )  + "\" -> " );
                                        start =, start);
                                        //System.err.println( start + " text: \"" + plainText.subSequence( start, start + 10 ) + "\"" );
                                        if (start == -1)
                                    } else if (plainText.charAt(start) == '[') {
                                        start =, start);
                                        if (start == -1)
                                    } else {
                                        final int end = plainText.indexOf('\n', start);
                                        if (end != -1)
                                                    new MutableString(plainText.substring(start, end)));

                                try {
                                    wikiModel.render(new HTMLConverter(), text.toString(), html, false, true);
                                    final Map<String, String> categories = wikiModel.getCategories();
                                    // Put back category links in the page (they have been parsed by bliki and to not appear anymore in the HTML rendering)
                                    for (Entry<String, String> entry : categories.entrySet()) {
                                        final String key = entry.getKey();
                                        final String value = entry.getValue().trim();
                                        if (value.length() != 0) // There are empty such things
                                            html.append("\n<a href=\"").append(baseURL).append("Category:")
                                                    .append(Encoder.encodeTitleToUrl(key, true)).append("\">")
                                } catch (Exception e) {
                                    LOGGER.error("Unexpected exception while parsing " + title, e);
                            readyDocumentsAndFactories.put(new DocumentAndFactory(
                                    freeFactory.getDocument(IOUtils.toInputStream(html, Charsets.UTF_8),
                                            new Reference2ObjectOpenHashMap<Enum<?>, Object>(metadata)),
                        } catch (InterruptedException e) {
                            throw new RuntimeException(e.getMessage(), e);
                        } catch (IOException e) {
                            throw new RuntimeException(e.getMessage(), e);

            public void characters(char[] ch, int start, int length) throws SAXException {
                if (inText && parseText)
                    text.append(ch, start, length);
                if (inTitle)
                    title.append(ch, start, length);
                if (inId)
                    id.append(ch, start, length);
                if (inTimestamp)
                    timestamp.append(ch, start, length);
                if (inNamespaceDef) {
                    nameSpaceAccumulator.append(ch, start, length);
                    inNamespaceDef = false; // Dirty, but it works

            public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
                if (inText && parseText)
                    text.append(ch, start, length);
                if (inTitle)
                    title.append(ch, start, length);

        // Thread whose run() opens the dump file, wraps it in a bzip2 decompressor when
        // bzipped is set, and rethrows any failure as a RuntimeException.
        final Thread parsingThread = new Thread() {
            public void run() {
                try {
                    InputStream in = new FileInputStream(wikipediaXmlDump);
                    if (bzipped)
                        in = new BZip2CompressorInputStream(in);
                    // NOTE(review): the next line is only the first argument of a call; the call
                    // head and the remaining argument(s) are missing from this text. Presumably a
                    // SAX parse of the UTF-8 decoded stream -- confirm against the upstream source.
                    // NOTE(review): no close() of `in` is visible in this block.
                            new InputSource(new InputStreamReader(new FastBufferedInputStream(in), Charsets.UTF_8)),
                } catch (Exception e) {
                    // Any failure is rethrown unchecked, keeping the original exception as cause.
                    throw new RuntimeException(e.getMessage(), e);


        // The returned iterator hands out the documents taken from readyDocumentsAndFactories.
        return new AbstractDocumentIterator() {
            // Factory associated with the document returned by the previous call, if any.
            private DocumentFactory lastFactory;

            /**
             * Takes the next entry from {@code readyDocumentsAndFactories} and returns its document.
             *
             * @return the document of the entry just taken; {@code null} is returned for the
             * {@code END} marker (subject to the review note inside the body).
             * @throws IOException declared by the signature; not thrown by the visible body.
             */
            public Document nextDocument() throws IOException {
                try {
                    // presumably a blocking take on a queue -- the declaration is not visible here.
                    final DocumentAndFactory documentAndFactory = readyDocumentsAndFactories.take();
                    // NOTE(review): this guard has no statement of its own in this text, so as
                    // written it governs the END check below. A statement using lastFactory was
                    // presumably lost here -- confirm against the upstream source.
                    if (lastFactory != null)
                    if (documentAndFactory == END)
                        return null;
                    // Remember the factory that goes with the document being handed out.
                    lastFactory = documentAndFactory.factory;
                    return documentAndFactory.document;
                } catch (InterruptedException e) {
                    // Interruption while waiting is surfaced as an unchecked exception.
                    throw new RuntimeException(e.getMessage(), e);

    public DocumentFactory factory() {
        return factory;

    /** A wrapper around a signed function that remaps entries exceeding a provided threshold using a specified target array. */
    public static final class SignedRedirectedStringMap extends AbstractObject2LongFunction<CharSequence>
            implements StringMap<CharSequence> {
        private static final long serialVersionUID = 1L;
        /** The number of documents. */
        private final long numberOfDocuments;
        /** A signed function function mapping valid keys to their ordinal position. */
        private Object2LongFunction<CharSequence> signedFunction;
        /** The value to be returned for keys whose ordinal position is greater than {@link #numberOfDocuments}. */
        private final long[] target;

        /** Creates a new signed redirected map.
         * @param numberOfDocuments the threshold after which the {@code target} array will be used to compute the output.
         * @param signedFunction the base signed function.
         * @param target an array providing the output for items beyond {@code numberOfDocuments}; it must be
         * long as the size of {@code signedFunction} minus {@code numberOfDocuments}.
        public SignedRedirectedStringMap(final long numberOfDocuments,
                final Object2LongFunction<CharSequence> signedFunction, final long[] target) {
            this.numberOfDocuments = numberOfDocuments;
            this.signedFunction = signedFunction;
   = target;

        public long getLong(Object key) {
            final long index = signedFunction.getLong(key);
            if (index == -1)
                return -1;
            if (index < numberOfDocuments)
                return index;
            return target[(int) (index - numberOfDocuments)];

        public boolean containsKey(Object key) {
            return signedFunction.getLong(key) != -1;

        public long size64() {
            return numberOfDocuments;

        public int size() {
            return (int) Math.min(Integer.MAX_VALUE, size64());

        public ObjectBigList<? extends CharSequence> list() {
            return null;

    /**
     * Command-line entry point: extracts the redirects from a Wikipedia dump, resolves them
     * against an existing virtual document resolver and stores a new resolver that also
     * covers the redirected URLs.
     *
     * <p>NOTE(review): several statements of this method are visibly truncated in this text
     * (see the notes in the body); they must be restored from the upstream source before the
     * method can compile.
     *
     * @param arg the command-line arguments, parsed with the JSAP definition below.
     * @throws ParserConfigurationException declared for the SAX parser creation.
     * @throws SAXException declared for the XML parse.
     * @throws IOException declared for the file and object I/O.
     * @throws JSAPException declared for the command-line definition.
     * @throws ClassNotFoundException declared for the deserialization of the existing resolver.
     */
    public static void main(final String arg[])
            throws ParserConfigurationException, SAXException, IOException, JSAPException, ClassNotFoundException {
        // Command line: two switches, one flagged option (signature width) and five required
        // positional arguments (file, baseURL, uris, vdr, redvdr).
        SimpleJSAP jsap = new SimpleJSAP(WikipediaDocumentSequence.class.getName(),
                "Computes the redirects of a Wikipedia dump and integrate them into an existing virtual document resolver for the dump.",
                new Parameter[] { new Switch("bzip2", 'b', "bzip2", "The file is compressed with bzip2"),
                        new Switch("iso", 'i', "iso",
                                "Use ISO-8859-1 coding internally (i.e., just use the lower eight bits of each character)."),
                        new FlaggedOption("width", JSAP.INTEGER_PARSER, Integer.toString(Long.SIZE),
                                JSAP.NOT_REQUIRED, 'w', "width",
                                "The width, in bits, of the signatures used to sign the function from URIs to their rank."),
                        new UnflaggedOption("file", JSAP.STRING_PARSER, JSAP.REQUIRED,
                                "The file containing the Wikipedia dump."),
                        new UnflaggedOption("baseURL", JSAP.STRING_PARSER, JSAP.REQUIRED,
                                "The base URL for the collection (e.g.,"),
                        new UnflaggedOption("uris", JSAP.STRING_PARSER, JSAP.REQUIRED,
                                "The URIs of the documents in the collection (generated by ScanMetadata)."),
                        new UnflaggedOption("vdr", JSAP.STRING_PARSER, JSAP.REQUIRED,
                                "The name of a precomputed virtual document resolver for the collection."),
                        new UnflaggedOption("redvdr", JSAP.STRING_PARSER, JSAP.REQUIRED,
                                "The name of the resulting virtual document resolver.") });

        JSAPResult jsapResult = jsap.parse(arg);
        // NOTE(review): this guard has no statement of its own in this text; presumably an
        // early exit after usage/error output was lost -- confirm against the upstream source.
        if (jsap.messagePrinted())

        final SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
        // Maps the title of each redirecting page to the title it redirects to.
        final Object2ObjectOpenHashMap<MutableString, String> redirects = new Object2ObjectOpenHashMap<MutableString, String>();
        final String baseURL = jsapResult.getString("baseURL");
        final ProgressLogger progressLogger = new ProgressLogger(LOGGER);
        progressLogger.itemsName = "redirects";
        progressLogger.start("Extracting redirects...");

        final SAXParser parser = saxParserFactory.newSAXParser();
        // SAX handler that accumulates the first title found and records a redirect whenever a
        // <redirect> element carrying a title attribute is met.
        final DefaultHandler handler = new DefaultHandler() {
            // True while character data belongs to the title being accumulated.
            private boolean inTitle;
            // Accumulator for the current title.
            private MutableString title = new MutableString();

            public void startElement(String uri, String localName, String qName, Attributes attributes)
                    throws SAXException {
                if ("page".equals(localName)) {
                    // NOTE(review): the title test below relies on the accumulator being empty,
                    // but no reset of `title` is visible here -- presumably lost; confirm.
                    inTitle = false;
                } else if ("title".equals(localName) && title.length() == 0)
                    inTitle = true; // We catch only the first title element.
                else if ("redirect".equals(localName) && attributes.getValue("title") != null) {
                    // A copy is stored because the accumulator is a mutable, reused object.
                    redirects.put(title.copy(), attributes.getValue("title"));

            public void endElement(String uri, String localName, String qName) throws SAXException {
                if ("title".equals(localName))
                    inTitle = false;

            public void characters(char[] ch, int start, int length) throws SAXException {
                if (inTitle)
                    title.append(ch, start, length);

            public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
                if (inTitle)
                    title.append(ch, start, length);

        // Open the dump, decompressing on the fly when the bzip2 switch was given.
        InputStream in = new FileInputStream(jsapResult.getString("file"));
        if (jsapResult.userSpecified("bzip2"))
            in = new BZip2CompressorInputStream(in);
        // NOTE(review): the call below ends with a dangling comma; its remaining argument and
        // closing parenthesis are missing from this text. No close() of `in` is visible either.
        parser.parse(new InputSource(new InputStreamReader(new FastBufferedInputStream(in), Charsets.UTF_8)),

        // Redirecting URLs that could be resolved, mapped to the document their target resolves to.
        final Object2LongLinkedOpenHashMap<MutableString> resolved = new Object2LongLinkedOpenHashMap<MutableString>();
        final VirtualDocumentResolver vdr = (VirtualDocumentResolver) BinIO.loadObject(jsapResult.getString("vdr"));

        progressLogger.expectedUpdates = redirects.size();
        progressLogger.start("Examining redirects...");

        for (Map.Entry<MutableString, String> e : redirects.entrySet()) {
            // Build the URLs of the redirecting page and of its target from the base URL.
            final MutableString start = new MutableString().append(baseURL)
                    .append(Encoder.encodeTitleToUrl(e.getKey().toString(), true));
            final MutableString end = new MutableString().append(baseURL)
                    .append(Encoder.encodeTitleToUrl(e.getValue(), true));
            final long s = vdr.resolve(start);
            if (s == -1) {
                // The redirecting URL is unknown to the resolver: try to resolve its target.
                final long t = vdr.resolve(end);
                if (t != -1)
                    resolved.put(start.copy(), t);
                    // NOTE(review): as written the warning below is unconditional; its
                    // indentation suggests a lost `else` -- confirm against the upstream source.
                    LOGGER.warn("Failed redirect: " + start + " -> " + end);
            } else
                LOGGER.warn("URL " + start + " is already known to the virtual document resolver");




        // All keys of the new resolver: the lines of the uris file followed by the resolved
        // redirecting URLs, in the iteration order of the linked map.
        final Iterable<MutableString> allURIs = Iterables
                .concat(new FileLinesCollection(jsapResult.getString("uris"), "UTF-8"), resolved.keySet());
        final long numberOfDocuments = vdr.numberOfDocuments();

        final TransformationStrategy<CharSequence> transformationStrategy = jsapResult.userSpecified("iso")
                ? TransformationStrategies.iso()
                : TransformationStrategies.utf16();

        // Store the new resolver under the redvdr name.
        // NOTE(review): the builder chain below is cut after transform(...); the end of that
        // argument list is missing from this text -- confirm against the upstream source.
        BinIO.storeObject(new URLMPHVirtualDocumentResolver(new SignedRedirectedStringMap(numberOfDocuments,
                new ShiftAddXorSignedStringMap(allURIs.iterator(),
                        new MWHCFunction.Builder<CharSequence>().keys(allURIs).transform(transformationStrategy)
                resolved.values().toLongArray())), jsapResult.getString("redvdr"));