marytts.tools.dbselection.WikipediaMarkupCleaner.java Source code

Java tutorial

Introduction

Here is the source code for marytts.tools.dbselection.WikipediaMarkupCleaner.java

Source

/**
 * Copyright 2007 DFKI GmbH.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */
package marytts.tools.dbselection;

import java.io.File;
import java.io.FileWriter;
import java.io.PrintWriter;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Scanner;
import java.util.Vector;

import org.apache.commons.lang.StringEscapeUtils;

/**
 * WikipediaMarkupCleaner
 * 
 * @author Marcela Charfuelan.
 */
public class WikipediaMarkupCleaner {

    // Locale setting; this class only stores it and hands it back through getLocale() in the code shown here.
    private String locale = null;
    // MySQL connection parameters (host, database name, user, password).
    private String mysqlHost = null;
    private String mysqlDB = null;
    private String mysqlUser = null;
    private String mysqlPasswd = null;
    // Wikipedia files: the XML dump to read and the log file.
    private String xmlWikiFile = null;
    private String wikiLog = null;
    // When true, the markup-removal helpers print diagnostic traces to System.out.
    private boolean debug = false;
    // Page id exposed through setTestId()/getTestId(); presumably selects a single page for debugging -- TODO confirm.
    private String debugPageId = null;
    // Default settings for the minimum page length and the minimum and maximum text length.
    private int minPageLength = 10000; // minimum size of a wikipedia page, to be used in the first filtering of pages
    private int minTextLength = 1000;
    // removeMarkup() starts a new text chunk once the accumulated clean text grows beyond this many characters.
    private int maxTextLength = 15000; // the average length in one big xml file is approx. 12000

    // Set this to false to save time by not loading the Wiki tables when they already exist in the DB.
    private boolean loadWikiTables = true;

    // Set this to false to keep adding to an already existing cleanText table instead of creating a new one.
    private boolean deleteCleanTextTable = true;

    /**
     * Stores the locale setting.
     *
     * @param str value assigned to the {@code locale} field
     */
    public void setLocale(String str) {
        this.locale = str;
    }

    /**
     * Stores the MySQL host.
     *
     * @param str value assigned to the {@code mysqlHost} field
     */
    public void setMysqlHost(String str) {
        this.mysqlHost = str;
    }

    /**
     * Stores the MySQL database name.
     *
     * @param str value assigned to the {@code mysqlDB} field
     */
    public void setMysqlDB(String str) {
        this.mysqlDB = str;
    }

    /**
     * Stores the MySQL user name.
     *
     * @param str value assigned to the {@code mysqlUser} field
     */
    public void setMysqlUser(String str) {
        this.mysqlUser = str;
    }

    /**
     * Stores the MySQL password.
     *
     * @param str value assigned to the {@code mysqlPasswd} field
     */
    public void setMysqlPasswd(String str) {
        this.mysqlPasswd = str;
    }

    /**
     * Stores the name of the Wikipedia XML file.
     *
     * @param str value assigned to the {@code xmlWikiFile} field
     */
    public void setXmlWikiFile(String str) {
        this.xmlWikiFile = str;
    }

    /**
     * Stores the name of the Wikipedia log file.
     *
     * @param str value assigned to the {@code wikiLog} field
     */
    public void setWikiLog(String str) {
        this.wikiLog = str;
    }

    /**
     * Stores the test page id. Note that the backing field is named {@code debugPageId}.
     *
     * @param str value assigned to the {@code debugPageId} field
     */
    public void setTestId(String str) {
        this.debugPageId = str;
    }

    /**
     * Stores the minimum page length.
     *
     * @param val value assigned to the {@code minPageLength} field
     */
    public void setMinPageLength(int val) {
        this.minPageLength = val;
    }

    /**
     * Stores the minimum text length.
     *
     * @param val value assigned to the {@code minTextLength} field
     */
    public void setMinTextLength(int val) {
        this.minTextLength = val;
    }

    /**
     * Stores the maximum text length; {@code removeMarkup} starts a new text chunk once this is exceeded.
     *
     * @param val value assigned to the {@code maxTextLength} field
     */
    public void setMaxTextLength(int val) {
        this.maxTextLength = val;
    }

    /**
     * Switches the diagnostic output of the markup-removal helpers on or off.
     *
     * @param bval value assigned to the {@code debug} field
     */
    public void setDebug(boolean bval) {
        this.debug = bval;
    }

    /**
     * Stores the flag that says whether the Wiki tables should be loaded.
     *
     * @param bval value assigned to the {@code loadWikiTables} field
     */
    public void setLoadWikiTables(boolean bval) {
        this.loadWikiTables = bval;
    }

    /**
     * Stores the flag that says whether a new cleanText table replaces an existing one.
     *
     * @param bval value assigned to the {@code deleteCleanTextTable} field
     */
    public void setDeleteCleanTextTable(boolean bval) {
        this.deleteCleanTextTable = bval;
    }

    /**
     * Returns the locale setting.
     *
     * @return the current value of the {@code locale} field ({@code null} until set)
     */
    public String getLocale() {
        return this.locale;
    }

    /**
     * Returns the MySQL host.
     *
     * @return the current value of the {@code mysqlHost} field ({@code null} until set)
     */
    public String getMysqlHost() {
        return this.mysqlHost;
    }

    /**
     * Returns the MySQL database name.
     *
     * @return the current value of the {@code mysqlDB} field ({@code null} until set)
     */
    public String getMysqlDB() {
        return this.mysqlDB;
    }

    /**
     * Returns the MySQL user name.
     *
     * @return the current value of the {@code mysqlUser} field ({@code null} until set)
     */
    public String getMysqlUser() {
        return this.mysqlUser;
    }

    /**
     * Returns the MySQL password.
     *
     * @return the current value of the {@code mysqlPasswd} field ({@code null} until set)
     */
    public String getMysqlPasswd() {
        return this.mysqlPasswd;
    }

    /**
     * Returns the name of the Wikipedia XML file.
     *
     * @return the current value of the {@code xmlWikiFile} field ({@code null} until set)
     */
    public String getXmlWikiFile() {
        return this.xmlWikiFile;
    }

    /**
     * Returns the name of the Wikipedia log file.
     *
     * @return the current value of the {@code wikiLog} field ({@code null} until set)
     */
    public String getWikiLog() {
        return this.wikiLog;
    }

    /**
     * Returns the test page id. Note that the backing field is named {@code debugPageId}.
     *
     * @return the current value of the {@code debugPageId} field ({@code null} until set)
     */
    public String getTestId() {
        return this.debugPageId;
    }

    /**
     * Returns the minimum page length.
     *
     * @return the current value of the {@code minPageLength} field (10000 unless changed)
     */
    public int getMinPageLength() {
        return this.minPageLength;
    }

    /**
     * Returns the minimum text length.
     *
     * @return the current value of the {@code minTextLength} field (1000 unless changed)
     */
    public int getMinTextLength() {
        return this.minTextLength;
    }

    /**
     * Returns the maximum text length.
     *
     * @return the current value of the {@code maxTextLength} field (15000 unless changed)
     */
    public int getMaxTextLength() {
        return this.maxTextLength;
    }

    /**
     * Tells whether diagnostic output is switched on.
     *
     * @return the current value of the {@code debug} field ({@code false} unless changed)
     */
    public boolean getDebug() {
        return this.debug;
    }

    /**
     * Tells whether the Wiki tables should be loaded.
     *
     * @return the current value of the {@code loadWikiTables} field ({@code true} unless changed)
     */
    public boolean getLoadWikiTables() {
        return this.loadWikiTables;
    }

    /**
     * Tells whether a new cleanText table replaces an existing one.
     *
     * @return the current value of the {@code deleteCleanTextTable} field ({@code true} unless changed)
     */
    public boolean getDeleteCleanTextTable() {
        return this.deleteCleanTextTable;
    }

    /** Section headings that mark the end of the readable article body; nothing after them is processed. */
    private static final String[] END_OF_TEXT_HEADINGS = { "==References", "== References", "==See also",
            "== See also", "==External links and sources", "==External links", "== External links",
            "== External Links", "== External links and sources", "==Notes", "== Notes", "==Sources", "== Sources",
            "==Foreign", "== Foreign", "==Discussion" };

    /** Which helper strips a given kind of block-level markup. */
    private enum BlockRemover {
        SECTION, TABLE, REF, IMAGE
    }

    /** One block-level markup construct: its opening tag, its closing tag and the helper that removes it. */
    private static final class BlockMarkup {
        final String iniTag;
        final String endTag;
        final BlockRemover remover;

        BlockMarkup(String iniTag, String endTag, BlockRemover remover) {
            this.iniTag = iniTag;
            this.endTag = endTag;
            this.remover = remover;
        }
    }

    /**
     * Block-level markup to strip, in the order it must be tried. The order is significant: for example the wiki
     * table "{|" has to be handled before "{{" because a table can contain templates.
     */
    private static final BlockMarkup[] BLOCK_MARKUP = {
            new BlockMarkup("<noinclude", "</noinclude>", BlockRemover.SECTION),
            new BlockMarkup("<includeonly", "</includeonly>", BlockRemover.SECTION),
            new BlockMarkup("<onlyinclude", "</onlyinclude>", BlockRemover.SECTION),
            new BlockMarkup("<table", "</table>", BlockRemover.SECTION),
            new BlockMarkup("<TABLE", "</TABLE>", BlockRemover.SECTION),
            new BlockMarkup("{{col-begin}}", "{{col-end}}", BlockRemover.SECTION),
            new BlockMarkup("{|", "|}", BlockRemover.TABLE),
            // references can be <ref>, <ref ...>, </ref> or />; the end tag is decided by removeSectionRef itself
            new BlockMarkup("<ref", "</ref>", BlockRemover.REF),
            new BlockMarkup("<REF", "</REF>", BlockRemover.SECTION),
            new BlockMarkup("<Ref", "</Ref>", BlockRemover.SECTION),
            new BlockMarkup("<reF", "</reF>", BlockRemover.SECTION),
            new BlockMarkup("{{start box}}", "{{end box}}", BlockRemover.SECTION),
            new BlockMarkup("{{", "}}", BlockRemover.SECTION),
            new BlockMarkup("<!--", "-->", BlockRemover.SECTION),
            new BlockMarkup("\\mathrel{|", "}", BlockRemover.SECTION),
            // a gallery might contain several images
            new BlockMarkup("<gallery", "</gallery>", BlockRemover.SECTION),
            new BlockMarkup("[[Image:", "]]", BlockRemover.IMAGE),
            // span and div tags are used to separate images from text
            new BlockMarkup("<div", "</div>", BlockRemover.SECTION),
            // NOTE(review): upper-case DIV goes through removeSectionImage while lower-case div does not;
            // kept exactly as in the original code -- confirm whether this is intended.
            new BlockMarkup("<DIV", "</DIV>", BlockRemover.IMAGE),
            new BlockMarkup("<span", "</span>", BlockRemover.SECTION),
            new BlockMarkup("<math>", "</math>", BlockRemover.SECTION),
            new BlockMarkup("<timeline>", "</timeline>", BlockRemover.SECTION),
            new BlockMarkup("<nowiki", "</nowiki>", BlockRemover.SECTION),
            new BlockMarkup("<source", "</source>", BlockRemover.SECTION),
            new BlockMarkup("<code", "</code>", BlockRemover.SECTION),
            new BlockMarkup("<imagemap", "</imagemap>", BlockRemover.SECTION),
            new BlockMarkup("<poem", "</poem>", BlockRemover.SECTION),
            new BlockMarkup("<h1", "</h1>", BlockRemover.SECTION),
            new BlockMarkup("<pre", "</pre>", BlockRemover.SECTION) };

    /** A line shorter than 200 characters that starts with one of these (list items, interwiki links, ...) is dropped. */
    private static final String[] SHORT_LINE_DISCARD_PREFIXES = { "*", "#", ";", ".", ",", "&", "}", "]", "|", "ca:",
            "cs:", "de:", "es:", "fr:", "it:", "hu:", "ja:", "no:", "pt:", "sl:", "fi:", "sv:", "tr:", "zh:",
            "Category:", "!style=", "!  style=", "!align=", "::<code" };

    /**
     * Inline tags that are simply deleted, applied one after the other in this order. The bare "<br" entry removes
     * the prefix of any br tag, so "<br/>" and "<br />" lose their remaining "/>" through the last entry.
     */
    private static final String[] INLINE_TAGS = { "<big>", "</big>", "<blockquote>", "</blockquote>", "<BLOCKQUOTE>",
            "</BLOCKQUOTE>", "<sup>", "</sup>", "<sub>", "</sub>", "<small>", "</small>", "<ul>", "</ul>", "<UL>",
            "</UL>", "<br>", "<br", "<BR>", "<br", "<br/>", "<Center>", "<center>", "</center>", "<CENTER>",
            "</CENTER>", "<cite>", "</cite>", "<li>", "</li>", "<LI>", "</LI>", "<dl>", "</dl>", "<dt>", "</dt>",
            "<dd>", "</dd>", "<b>", "</b>", "<p>", "</p>", "<u>", "</u>", "<tt>", "</tt>", "<i>", "</i>", "<I>",
            "</I>", "<s>", "</s>", "<em>", "</em>", "</br>", "</div>", "</ref>", "/>" };

    /**
     * Converts the wiki markup of one page into plain text.
     * <p>
     * The page is read line by line until one of the closing headings (References, See also, External links, ...)
     * is reached. Block-level markup (templates, tables, references, comments, images, ...) is removed, short
     * list-like lines are dropped, inline formatting and HTML entities are resolved, and the surviving lines are
     * concatenated, one per line. Whenever the accumulated text grows beyond {@code maxTextLength} characters it is
     * stored as one element of the result and a new chunk is started.
     *
     * @param page
     *            wiki markup of one page; {@code null} is treated like an empty page
     * @return the clean text split into chunks, possibly empty, never {@code null}
     */
    public Vector<String> removeMarkup(String page) {
        Vector<String> textList = new Vector<String>();
        if (page == null) {
            return textList;
        }

        StringBuffer str = new StringBuffer();
        Scanner s = new Scanner(page);
        try {
            // hasNext() is kept on purpose: it also ends the loop as soon as only whitespace is left,
            // so trailing blank lines never reach the output.
            while (s.hasNext()) {
                StringBuffer line = new StringBuffer(s.nextLine());
                if (isEndOfTextHeading(line)) {
                    break;
                }

                // may pull further lines from the scanner when a block spans several lines
                line = removeBlockMarkup(s, line);
                if (line.length() == 0 || isDiscardableShortLine(line.toString())) {
                    continue;
                }

                line = stripInlineMarkup(s, line);

                str.append(line);
                if (str.length() == 0 || str.charAt(str.length() - 1) != '\n') {
                    str.append("\n");
                }

                // check length of the text
                if (str.length() > maxTextLength) {
                    textList.add(str.toString());
                    str = new StringBuffer();
                }
            }
        } finally {
            s.close();
        }

        if (str.length() > 0) {
            textList.add(str.toString());
        }
        return textList;
    }

    /** Tells whether the line contains one of the headings after which the page is no longer processed. */
    private static boolean isEndOfTextHeading(StringBuffer line) {
        for (String heading : END_OF_TEXT_HEADINGS) {
            if (line.indexOf(heading) >= 0) {
                return true;
            }
        }
        return false;
    }

    /**
     * Applies every entry of {@link #BLOCK_MARKUP} to the line, repeating the whole pass until a pass removes
     * nothing: removing a section can append further lines that again contain markup to remove.
     */
    private StringBuffer removeBlockMarkup(Scanner s, StringBuffer lineIn) {
        StringBuffer line = lineIn;
        boolean clean = false;
        while (!clean && line.length() > 0) {
            clean = true;
            for (BlockMarkup markup : BLOCK_MARKUP) {
                if (line.indexOf(markup.iniTag) < 0) {
                    continue;
                }
                switch (markup.remover) {
                case TABLE:
                    line = removeSectionTable(s, line, markup.iniTag, markup.endTag);
                    break;
                case REF:
                    line = removeSectionRef(s, line);
                    break;
                case IMAGE:
                    line = removeSectionImage(s, line, markup.iniTag, markup.endTag);
                    break;
                default:
                    line = removeSection(s, line, markup.iniTag, markup.endTag);
                    break;
                }
                clean = false;
            }
        }
        return line;
    }

    /** Tells whether a line is a short (under 200 characters) bulleted, numbered, interwiki or link-only line. */
    private static boolean isDiscardableShortLine(String text) {
        if (text.length() >= 200) {
            return false;
        }
        if (text.endsWith("]]")) {
            return true;
        }
        for (String prefix : SHORT_LINE_DISCARD_PREFIXES) {
            if (text.startsWith(prefix)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Removes the inline markup of a non-empty line: bold/italic quotes, internal and external links, HTML
     * entities, simple HTML tags, quotation marks, heading markers and list markers.
     */
    private StringBuffer stripInlineMarkup(Scanner s, StringBuffer lineIn) {
        // '''''bold & italic''''', '''bold''', ''italic''
        String text = lineIn.toString().replace("'''''", "").replace("'''", "").replace("''", "");

        // [[Name of page]], [[Name of page|Text to display]], [http://www.example.org Text to display], ...
        text = processInternalAndExternalLinks(new StringBuffer(text)).toString();

        // this will convert HTML &nbsp; &ndash; etc.
        text = StringEscapeUtils.unescapeHtml(text);

        // The previous does not remove all HTML stuff, so some of it is removed manually
        for (String tag : INLINE_TAGS) {
            text = text.replace(tag, "");
        }

        // Removing quotation marks: the ASCII one and the typographic ones (U+201C, U+201D).
        // The typographic characters are written as Unicode escapes so that the source file survives any
        // encoding round-trip; the raw literals had been destroyed, leaving patterns that matched the empty
        // string or did not compile at all.
        text = text.replace("\"", "");
        text = text.replace("\u201C", "").replace("\u201D", "");
        // typographic apostrophe (U+2019) and dashes (U+2014, U+2013) are replaced by their ASCII look-alikes
        text = text.replace('\u2019', '\'');
        text = text.replace('\u2014', '-').replace('\u2013', '-');
        // NOTE(review): the original code replaced two more characters by a blank but their identity was lost
        // in the same encoding accident; no-break space (U+00A0) and thin space (U+2009) are assumed here --
        // TODO confirm against the upstream source.
        text = text.replace('\u00A0', ' ').replace('\u2009', ' ');

        // finally sections and lists: a heading loses its == markers and is terminated with a full stop
        boolean isTitle = text.startsWith("==");
        text = text.replaceAll("\\s*==+$|==+", "");
        if (isTitle) {
            text = text + ".";
        }
        text = stripListMarkers(text);

        // remove these when the text is almost clean; only the opening tag is removed here
        StringBuffer line = new StringBuffer(text);
        if (line.indexOf("<font") >= 0) {
            line = removeSection(s, line, "<font", ">");
        }
        line = new StringBuffer(line.toString().replace("</font>", ""));
        if (line.indexOf("<blockquote") >= 0) {
            line = removeSection(s, line, "<blockquote", ">");
        }
        if (line.indexOf("<ol") >= 0) {
            line = removeSection(s, line, "<ol", ">");
        }
        if (line.indexOf("<http:") >= 0) {
            line = removeSection(s, line, "<http:", ">");
        }
        return line;
    }

    /** Removes leading bulleted-list, numbered-list, indentation and glossary markers from a line. */
    private static String stripListMarkers(String textIn) {
        String text = textIn;
        if (text.startsWith("***") || text.startsWith("*#*")) {
            text = text.substring(3);
        }
        if (text.startsWith("**") || text.startsWith(":*") || text.startsWith("*#") || text.startsWith("##")
                || text.startsWith("::")) {
            text = text.substring(2);
        }
        if (text.startsWith("*") || text.startsWith("#")) {
            text = text.substring(1);
        }
        if (text.startsWith(";")) { // in glossaries definitions start with ;
            text = text.substring(1);
        }
        return text;
    }

    /**
     * Removes every reference from a line of wiki markup. References are special because they come in three forms:
     *
     * <pre>
     * &lt;ref&gt; ... &lt;/ref&gt;
     * &lt;ref  ... &lt;/ref&gt;
     * &lt;ref  ... /&gt;
     * </pre>
     *
     * When the end of a reference is not on the current line, further lines are read from the scanner and appended
     * (without a separator) until the reference is closed. If the input ends before that happens the whole
     * accumulated line is discarded.
     *
     * @param s
     *            scanner positioned after {@code lineIn}; may be advanced
     * @param lineIn
     *            the line to clean; it is not modified
     * @return a new buffer holding the line without references, possibly empty
     */
    private StringBuffer removeSectionRef(Scanner s, StringBuffer lineIn) {
        int index1 = 0;
        int endTagLength = 0; // exclusive end offset of the reference to delete (name kept from the original code)
        int numRef = 0; // number of references still waiting for their end
        StringBuffer line = new StringBuffer(lineIn);
        StringBuffer nextLine;

        while ((index1 = line.indexOf("<ref")) >= 0) { // in one line can be more than one reference
            numRef++;
            int end = findRefEnd(line, index1);
            if (end >= 0) { // the end was found on this line
                endTagLength = end;
                numRef--;
            } else { // the end must be in the next lines, so get more lines until it is found
                while (s.hasNext() && numRef != 0) {
                    nextLine = new StringBuffer(s.nextLine());
                    if (nextLine.indexOf("<ref") >= 0)
                        numRef++;
                    line.append(nextLine);
                    end = findRefEnd(line, index1);
                    if (end >= 0) {
                        numRef--;
                        endTagLength = end;
                    }
                }
            }

            if (numRef == 0) {
                index1 = line.indexOf("<ref"); // get again this because the position might change
                if (endTagLength > index1) {
                    line.delete(index1, endTagLength);
                } else {
                    if (debug) {
                        System.out.print("iniTag: <ref  index1=" + index1);
                        System.out.print("  endTagLength=" + endTagLength);
                        System.out.println("  line.length=" + line.length() + "  line: " + line);
                        System.out.println("removeSectionRef: WARNING endTagLength > length of line: " + line);
                    }
                    line = new StringBuffer("");
                }
            } else {
                if (debug)
                    System.out.println("removeSectionRef: WARNING no </ref> or /> in " + line);
                line = new StringBuffer("");
            }

        } // while this line contains references

        return line;

    }

    /**
     * Finds where the reference that opens at {@code start} ends.
     * <p>
     * A self-closing opening tag ends the reference right there. Without this check a self-closing reference that
     * is followed by a normal one on the same line would be paired with the later closing tag, and the article text
     * between the two would be deleted as well. Otherwise the first closing tag is used, and only if there is none
     * the first "/&gt;" found anywhere after {@code start}.
     *
     * @return the offset just behind the end of the reference, or -1 if the line does not contain it yet
     */
    private static int findRefEnd(StringBuffer line, int start) {
        int tagClose = line.indexOf(">", start);
        if (tagClose > start && line.charAt(tagClose - 1) == '/') {
            return tagClose + 1;
        }
        int closingTag = line.indexOf("</ref>", start);
        if (closingTag >= 0) {
            return closingTag + 6;
        }
        int selfClose = line.indexOf("/>", start);
        return selfClose >= 0 ? selfClose + 2 : -1;
    }

    /**
     * Removes every span that starts with {@code iniTag} and ends with {@code endTag} from a line of wiki markup.
     * <p>
     * If the end tag is not on the current line, further lines are read from the scanner and appended (without a
     * separator) while opening and closing tags are counted, until the section is balanced. If the input ends
     * before that happens the whole accumulated line is discarded.
     *
     * @param s
     *            scanner positioned after {@code lineIn}; may be advanced
     * @param lineIn
     *            the line to clean; it is not modified
     * @param iniTag
     *            text that opens the section
     * @param endTag
     *            text that closes the section
     * @return a new buffer holding the line without the sections, possibly empty
     */
    private StringBuffer removeSection(Scanner s, StringBuffer lineIn, String iniTag, String endTag) {
        int endTagLength = 0; // exclusive end offset of the span to delete (name kept from the original code)
        int numRef = 0; // number of iniTags still waiting for their endTag
        StringBuffer line = new StringBuffer(lineIn);

        if (debug)
            System.out.println("Removing tag: " + iniTag + "  LINE (BEFORE): " + line);

        int index1;
        while ((index1 = line.indexOf(iniTag)) >= 0) { // in one line can be more than one iniTag

            numRef++;
            int index2 = line.indexOf(endTag, index1);
            if (index2 >= 0) { // the endTag was found on this line
                endTagLength = index2 + endTag.length();
                numRef--;
            } else { // the endTag must be in the next lines, so get more lines until the endTag is found
                // Only an endTag behind the opening tag can close it. Searching from the start of the line
                // would count a stray endTag in front of the iniTag and make the whole line disappear.
                int lastEndTag = index1;

                while (s.hasNext() && numRef != 0) {
                    String nextLine = s.nextLine();

                    // count the sections that are opened on the new line
                    int lastIniTag = 0;
                    int nested;
                    while ((nested = nextLine.indexOf(iniTag, lastIniTag)) >= 0) {
                        numRef++;
                        lastIniTag = nested + iniTag.length();
                    }

                    line.append(nextLine);

                    // Close sections, each time continuing behind the last endTag found. Stop as soon as
                    // everything is balanced: counting below zero would keep the outer loop swallowing lines.
                    while (numRef > 0 && (index2 = line.indexOf(endTag, lastEndTag)) >= 0) {
                        numRef--;
                        lastEndTag = index2 + endTag.length(); // remember where the last endTag was found
                        endTagLength = lastEndTag;
                    }
                }
            }

            if (numRef == 0) {
                if (endTagLength > index1) {
                    if (debug) {
                        System.out.println("    FINAL LINE: " + line);
                        System.out.print("iniTag: " + iniTag + "  index1=" + index1);
                        System.out.print("  endTagLength=" + endTagLength);
                        System.out.println("  line.length=" + line.length() + "  line: " + line);
                        System.out.println("  line.length=" + line.length());
                    }
                    line.delete(index1, endTagLength);
                } else { // defensive: the end of the section lies in front of its start
                    if (debug) {
                        System.out.println("removeSection: WARNING endTagLength > length of line: ");
                        System.out.print("iniTag: " + iniTag + "  index1=" + index1);
                        System.out.print("  endTagLength=" + endTagLength);
                        System.out.println("  line.length=" + line.length() + "  line: " + line);
                        System.out.println("removeSection: WARNING endTagLength > length of line: " + line);
                    }
                    line = new StringBuffer("");
                }
            } else { // the input ended inside the section
                if (debug)
                    System.out.println("removeSection: WARNING no " + endTag);
                line = new StringBuffer("");
            }

        } // while this line contains iniTag-s

        if (debug)
            System.out.println("    LINE (AFTER): " + line);
        return line;
    }

    /**
     * Removes every section delimited by iniTag ... endTag from a line, reading further lines
     * from the scanner when the section is not closed on the current line.
     * <p>
     * When the closing tag has to be looked for in following lines, nested opening tags are
     * counted, and a closing tag is only recognised when a following line STARTS with endTag.
     * The lines read from the scanner are appended to the working line without any separator.
     * (Presumably used for wiki tables, judging by the method name - verify against the caller.)
     *
     * @param s scanner over the remaining text, consumed while the section is still open
     * @param lineIn the current line; it is copied, the argument itself is not modified
     * @param iniTag the opening tag of the section to remove
     * @param endTag the closing tag of the section to remove
     * @return the working line without the removed sections, or an empty buffer when a section
     *         could not be closed or the computed end position is not after the opening tag
     */
    private StringBuffer removeSectionTable(Scanner s, StringBuffer lineIn, String iniTag, String endTag) {
        // NOTE: 'next' and 'closeRef' are never used, and 'lastEndTag' is assigned but never read.
        String next;
        int index1 = 0, index2 = -1, endTagLength = 0, numRef = 0, lastEndTag = 0, lastIniTag = 0;
        boolean closeRef = true;
        StringBuffer line = new StringBuffer(lineIn);
        StringBuffer nextLine;

        if (debug)
            System.out.println("Removing tag: " + iniTag + "  LINE (BEFORE): " + line);

        while ((index1 = line.indexOf(iniTag)) >= 0) { // one line can contain more than one iniTag

            // numRef counts the sections that are currently open
            numRef++;
            // endTagLength is the index just past the closing tag, i.e. the exclusive end of the deletion
            if ((index2 = line.indexOf(endTag, index1)) >= 0)
                endTagLength = endTag.length() + index2;

            if (index2 == -1) {// the endTag must be in the next lines, so get more lines until the endTag is found
                lastEndTag = 0; // start to look for the endTag in 0

                while (s.hasNext() && numRef != 0) {
                    lastIniTag = 0;
                    nextLine = new StringBuffer(s.nextLine());
                    //if(debug)
                    //  System.out.println("  NEXTLINE: " + nextLine);

                    // every further iniTag in the new line opens one more nested section
                    while ((index1 = nextLine.indexOf(iniTag, lastIniTag)) >= 0) {
                        numRef++;
                        lastIniTag = iniTag.length() + index1;
                    }
                    // next time it will look for the endTag after the position of the last it found.
                    //while( (index2 = line.indexOf(endTag, lastEndTag)) >= 0 ){
                    // a section is only closed by a line that begins with the endTag
                    if (nextLine.toString().startsWith(endTag)) {
                        numRef--;
                        //index2 = line.length();
                        //lastEndTag = index2 + endTag.length();  // I need to remember where the last endTag was found
                        // nextLine has not been appended yet, so the endTag will start at line.length()
                        endTagLength = line.length() + endTag.length();
                    }

                    line.append(nextLine);

                    //if(debug)
                    //  System.out.println("LINE (numRef=" + numRef + "): " + line);
                }
            } else // the endTag was found
                numRef--;

            if (numRef == 0) {
                // all sections are closed: delete from the first iniTag up to the end of the last endTag
                index1 = line.indexOf(iniTag); // get again this because the position might change
                if (endTagLength > index1) {
                    if (debug) {
                        System.out.println("    FINAL LINE: " + line);
                        System.out.print("iniTag: " + iniTag + "  index1=" + index1);
                        System.out.print("  endTagLength=" + endTagLength);
                        System.out.println("  line.length=" + line.length() + "  line: " + line);
                        System.out.println("  line.length=" + line.length());
                    }
                    line.delete(index1, endTagLength);
                } else {
                    // the end position is not after the opening tag: give up and drop the whole line
                    if (debug) {
                        System.out.println("removeSection: WARNING endTagLength > length of line: ");
                        System.out.print("iniTag: " + iniTag + "  index1=" + index1);
                        System.out.print("  endTagLength=" + endTagLength);
                        System.out.println("  line.length=" + line.length() + "  line: " + line);
                        System.out.println("removeSection: WARNING endTagLength > length of line: " + line);
                    }
                    line = new StringBuffer("");
                }

                //System.out.println("nextline="+line);
            } else {
                // the scanner ran out of lines before the section was closed: drop everything collected
                if (debug)
                    System.out.println("removeSection: WARNING no " + endTag);
                line = new StringBuffer("");
            }

        } // while this line contains iniTag-s

        if (debug)
            System.out.println("    LINE (AFTER): " + line);
        return line;
    }

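    /**
     * Removes image sections such as [[Image: ... ]] from a line. This needs special handling
     * because an image section may itself contain links of the form [[ ... ]], so the first ]]
     * found after the opening tag is not necessarily the one that closes the image section.
     * @param s scanner over the remaining text; further lines are read from it while the section is still open
     * @param lineIn the current line; it is copied, the argument itself is not modified
     * @param iniTag the opening tag of the section to remove
     * @param endTag the closing tag; only its length is used (and its text in debug output), the closing
     *        position itself is found by searching for "]]"
     * @return the line (plus any lines appended from the scanner) without the image sections, or an
     *         empty buffer when a section could not be closed
     */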
    private StringBuffer removeSectionImage(Scanner s, StringBuffer lineIn, String iniTag, String endTag) {
        int index1, index2;
        int endTagLength = 0; // index just past the "]]" that closes the section (exclusive end of the deletion)
        int numRef = 0; // number of image sections that are still open
        StringBuffer line = new StringBuffer(lineIn);
        StringBuffer aux;

        if (debug)
            System.out.println("Removing tag: " + iniTag + "  LINE (BEFORE): " + line);

        while ((index1 = line.indexOf(iniTag)) >= 0) { // in one line can be more than one iniTag

            numRef++;
            endTagLength = index1;

            // The text collected so far is always scanned BEFORE asking the scanner for more input.
            // Previously the scan was guarded by s.hasNext(), so a section that was closed on the
            // last available line (or on the last line just appended) was never detected and the
            // whole line was thrown away.
            while (numRef > 0) {

                while ((index2 = line.indexOf("]]", endTagLength)) >= 0 && numRef > 0) {
                    // text between the last position examined and this "]]" (inclusive)
                    aux = new StringBuffer(line.subSequence(index1 + 2, index2 + 2));
                    if (debug)
                        System.out.println("    aux=" + aux);
                    if (aux.indexOf("[[") == -1) {
                        // no nested "[[" in between: this "]]" closes the image section
                        endTagLength = endTag.length() + index2;
                        numRef--;
                    } else { // The previous was a [[ ]] inside of a [[Image: so it has to be skipped
                        index1 = index2;
                        endTagLength = index2 + 2;
                    }
                }

                // so far it has not found the endTag, so get another line (if there is one)
                if (numRef > 0) {
                    if (!s.hasNext())
                        break; // input exhausted, the section is never closed
                    line.append(s.nextLine());
                }
            }

            if (numRef == 0) {
                index1 = line.indexOf(iniTag); // get again this because the position might change
                if (endTagLength > index1) {
                    if (debug) {
                        System.out.println("    FINAL LINE: " + line);
                        System.out.print("iniTag: " + iniTag + "  index1=" + index1);
                        System.out.print("  endTagLength=" + endTagLength);
                        System.out.println("  line.length=" + line.length() + "  line: " + line);
                        System.out.println("  line.length=" + line.length());
                    }
                    line.delete(index1, endTagLength);
                } else {
                    if (debug) {
                        System.out.println("removeSection: WARNING endTagLength > length of line: ");
                        System.out.print("iniTag: " + iniTag + "  index1=" + index1);
                        System.out.print("  endTagLength=" + endTagLength);
                        System.out.println("  line.length=" + line.length() + "  line: " + line);
                        System.out.println("removeSection: WARNING endTagLength > length of line: " + line);
                    }
                    line = new StringBuffer("");
                }

            } else {
                // the section was never closed: drop everything that was collected
                if (debug)
                    System.out.println("removeSection: WARNING no " + endTag);
                line = new StringBuffer("");
            }

        } // while this line contains iniTag-s

        if (debug)
            System.out.println("    LINE (AFTER): " + line);
        return line;
    }

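    /**
     * Strips wiki link markup from a line, keeping the text that should be displayed.
     * Internal links:
     *  [[Name of page]]                           becomes  Name of page
     *  [[Name of page|Text to display]]           becomes  Text to display
     * External links:
     *  [http://www.example.org Text to display]   becomes  Text to display
     *  [http://www.example.org]                   is removed completely
     *  http://www.example.org                     (bare URL) is not touched by this method
     */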
    private StringBuffer processInternalAndExternalLinks(StringBuffer line) {
        // The buffer passed in is modified in place and also returned.
        // All searches for a closing tag start at the position of the opening tag; searching
        // from the beginning of the line would pick up a stray "]]" or "]" that appears earlier
        // in the line and delete that one instead of the real closing tag.
        int index1, index2, index3;
        StringBuffer linetmp = null; // for debugging
        boolean changed = false;
        if (debug)
            linetmp = new StringBuffer(line);

        // Internal links:
        while ((index1 = line.indexOf("[[")) >= 0) {
            changed = true;
            if ((index2 = line.indexOf("]]", index1)) >= 0) {
                if ((index3 = line.indexOf("|", index1)) >= 0 && index3 < index2) { // if there is text to display
                    line.delete(index1, index3 + 1); // delete "[[", the page name and the "|"
                } else {
                    line.delete(index1, index1 + 2); // delete the [[
                }
                // some text was deleted, so the closing ]] has moved: it is now the first one at or after index1
                index2 = line.indexOf("]]", index1);
                line.delete(index2, index2 + 2);
            } else {
                if (debug) {
                    System.out.println("processInternalAndExternalLinks: WARNING no ]] tag in " + line);
                    System.out.println("deleting [[");
                }
                line.delete(index1, index1 + 2); // delete the [[
            }
        }

        // External links: just the ones started with [http: or [https:
        // The text to display (everything after the first blank) is kept; a link without
        // text to display is deleted completely.
        while ((index1 = line.indexOf("[http:")) >= 0 || (index1 = line.indexOf("[https:")) >= 0) {
            changed = true;
            if ((index2 = line.indexOf("]", index1)) >= 0) {
                if ((index3 = line.indexOf(" ", index1)) >= 0 && index3 < index2) { // if there is text to display
                    line.delete(index1, index3 + 1); // delete "[http:..." up to and including the first blank
                    index2 = line.indexOf("]", index1); // the closing ] has moved, find it again
                    line.delete(index2, index2 + 1);
                } else {
                    line.delete(index1, index2 + 1); // no text to display, delete the whole ref
                }
            } else {
                if (debug) {
                    System.out.println(
                            "processInternalAndExternalLinks: WARNING no ] tag when processing lines with http: line="
                                    + line);
                    System.out.println("deleting [");
                }
                line.delete(index1, index1 + 1); // delete the [
            }
        }

        if (debug && changed) {
            System.out.println("Removing links, LINE(BEFORE): " + linetmp);
            System.out.println("                LINE (AFTER): " + line);
        }

        return line;

    }

    /**
     * Counts the words of a text in a frequency map.
     * <p>
     * The text is split into lines ("\n") and then into blank-separated tokens. Inside each token
     * every maximal run of characters accepted by StringUtils.isLetterOrModifier is taken as a
     * word; runs shorter than two characters are ignored. The token is walked by Unicode code
     * point, so letters outside the Basic Multilingual Plane (stored as surrogate pairs) do not
     * break a word apart.
     *
     * @param text the text whose words are counted
     * @param wordList map from word to number of occurrences, updated in place
     */
    public void addWordToHashMap(String text, HashMap<String, Integer> wordList) {
        final int minimumLength = 2;

        for (String sentence : text.split("\n")) {
            for (String w : sentence.split(" ")) {
                // Split into letter sections that we will consider atomic "words":
                int start = 0; // start of the current letter run, -1 when not inside a run
                int end = 0;
                while (end < w.length()) {
                    int codePoint = w.codePointAt(end);
                    if (marytts.util.string.StringUtils.isLetterOrModifier(codePoint)) {
                        if (start < 0)
                            start = end;
                    } else {
                        // not a letter: the current run (if any) ends here
                        if (start >= 0 && w.codePointCount(start, end) >= minimumLength)
                            incrementWordCount(wordList, w.substring(start, end));
                        start = -1;
                    }
                    // advance by one or two chars, depending on the width of the code point
                    end += Character.charCount(codePoint);
                }
                // a run that reaches the end of the token
                if (start >= 0 && w.codePointCount(start, end) >= minimumLength)
                    incrementWordCount(wordList, w.substring(start, end));
            }
        }
    }

    /** Adds one occurrence of word to wordList, inserting the word with count 1 if it is new. */
    private static void incrementWordCount(HashMap<String, Integer> wordList, String word) {
        Integer count = wordList.get(word);
        wordList.put(word, count == null ? 1 : count + 1);
    }

    /**
     * Stores a word list in the DB. When a wordList table already exists for the locale, the
     * frequencies of the new list are added to the stored ones and the combined list is
     * inserted; otherwise the new list is inserted as it is.
     *
     * @param wikiToDB handler of the DB that holds the wordList table
     * @param wlNew the newly collected words and their frequencies
     */
    public void updateWordList(DBHandler wikiToDB, HashMap<String, Integer> wlNew) {
        final String tableName = locale + "_wordList";

        // No table yet: nothing to merge with, just save the new list.
        if (!wikiToDB.tableExist(tableName)) {
            System.out.println("Saving " + tableName + " table....");
            wikiToDB.insertWordList(wlNew);
            return;
        }

        System.out.println("Updating " + tableName + " in DB table....");
        HashMap<String, Integer> merged = wikiToDB.getMostFrequentWords(0, 0);

        // Add the new frequency of every word to the one already stored (if any).
        for (String word : wlNew.keySet()) {
            final int added = wlNew.get(word);
            final Integer stored = merged.get(word);
            merged.put(word, stored == null ? added : stored + added);
        }

        wikiToDB.insertWordList(merged);
        System.out.println(
                "Final size of wordList after combining old and new lists: wordList=[" + merged.size() + "]");
    }

    /**
     * Debug helper: fetches the single page identified by debugPageId from the DB, prints its
     * raw text, runs the markup cleaner on it and prints every resulting text chunk.
     * Nothing is written back to the DB.
     *
     * @throws Exception if the DB connection, the log file or the page retrieval fails
     */
    void processWikipediaSQLTablesDebug() throws Exception {

        DBHandler wikiToDB = new DBHandler(locale);
        wikiToDB.createDBConnection(mysqlHost, mysqlDB, mysqlUser, mysqlPasswd);

        PrintWriter pw = null;
        try {
            if (wikiLog != null)
                pw = new PrintWriter(new FileWriter(new File(wikiLog)));

            // get text from the DB
            StringBuilder textId = new StringBuilder();
            String text = wikiToDB.getTextFromWikiPage(debugPageId, minPageLength, textId, pw);

            // The page text can be null (the original code already checked for it, but only
            // after calling text.length()), so nothing may be dereferenced before this check.
            if (text != null) {
                System.out.println("\nPAGE SIZE=" + text.length() + "  text:\n" + text);

                Vector<String> textList = removeMarkup(text);
                System.out.println("\nCLEANED TEXT:");
                for (int i = 0; i < textList.size(); i++)
                    System.out.println("text(" + i + "): \n" + textList.get(i));

            } else
                System.out.println("NO CLEANED TEXT.");

        } finally {
            // release the log file and the connection even when something above failed
            if (pw != null)
                pw.close();
            wikiToDB.closeDBConnection();
        }

    }

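    /**
     * Using mwdumper, extracts the pages from an xmlWikiFile and loads them into a mysql DB (it loads
     * the tables "locale_text", "locale_page" and "locale_revision", where locale is the corresponding
     * wikipedia language). When loading is disabled, these tables must already exist in the DB.
     * Once the tables are available, the text of the pages is extracted and cleaned and stored in a
     * cleanText table. A wordList table including word frequencies is created as well, and at the
     * end the wikipedia text, page and revision tables are deleted.
     * @throws Exception if the wikipedia tables are not available or a DB or file operation fails
     */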
    void processWikipediaPages() throws Exception {
        // Load wikipedia pages, extract clean text and create word list.
        DateFormat fullDate = new SimpleDateFormat("dd_MM_yyyy_HH:mm:ss");
        String dateStringIni = fullDate.format(new Date());

        DBHandler wikiToDB = new DBHandler(locale);

        System.out.println("Creating connection to DB server...");
        wikiToDB.createDBConnection(mysqlHost, mysqlDB, mysqlUser, mysqlPasswd);

        // Everything that needs the connection runs inside try/finally, so that the log file
        // is flushed and closed and the connection is released even when a step fails.
        PrintWriter pw = null;
        try {
            // This loading can take a while
            // create and load TABLES: page, text and revision
            if (loadWikiTables) {
                System.out.println(
                        "Creating and loading TABLES: page, text and revision. (The loading can take a while...)");
                wikiToDB.loadPagesWithMWDumper(xmlWikiFile, locale, mysqlHost, mysqlDB, mysqlUser, mysqlPasswd);
            } else {
                // Checking if tables are already created and loaded in the DB
                if (wikiToDB.checkWikipediaTables())
                    System.out.println("TABLES " + locale + "_page, " + locale + "_text and " + locale
                            + "_revision already loaded (WARNING USING EXISTING WIKIPEDIA TABLES).");
                else
                    throw new Exception("WikipediaMarkupCleaner: ERROR IN TABLES " + locale + "_page, " + locale
                            + "_text and " + locale + "_revision, they are not CREATED/LOADED.");
            }

            System.out.println("\nGetting page IDs");
            String pageId[] = wikiToDB.getIds("page_id", locale + "_page");
            System.out.println("Number of page IDs to process: " + pageId.length + "\n");

            // create cleanText TABLE
            if (deleteCleanTextTable) {
                System.out.println("Creating (deleting if already exist) " + locale + "_cleanText TABLE");
                wikiToDB.createWikipediaCleanTextTable();
            } else {
                if (wikiToDB.tableExist(locale + "_cleanText"))
                    System.out.println(
                            locale + "_cleanText TABLE already exist (ADDING TO EXISTING cleanText TABLE)");
                else {
                    System.out.println("Creating " + locale + "_cleanText TABLE");
                    wikiToDB.createWikipediaCleanTextTable();
                }
            }

            // hashMap for the dictionary, HashMap is faster than TreeMap so the list of words is
            // kept in a hashMap. When the process finishes the hashMap is dumped into the database.
            System.out.println("Starting Hashtable for wordList.");
            int initialCapacity = 200000;
            HashMap<String, Integer> wordList = new HashMap<String, Integer>(initialCapacity);

            if (wikiLog != null)
                pw = new PrintWriter(new FileWriter(new File(wikiLog)));

            StringBuilder textId = new StringBuilder();
            int numPagesUsed = 0;

            System.out.println("\nStart processing Wikipedia pages.... Start time:" + dateStringIni + "\n");

            for (int i = 0; i < pageId.length; i++) {

                // first filter: the text is null when the page is not going to be used
                String pageText = wikiToDB.getTextFromWikiPage(pageId[i], minPageLength, textId, pw);
                if (pageText == null)
                    continue;

                Vector<String> textList = removeMarkup(pageText);
                numPagesUsed++;
                for (int j = 0; j < textList.size(); j++) {
                    String text = textList.get(j);
                    if (text.length() > minTextLength) {
                        // the cleaned text is long enough: keep it in the cleanText table
                        wikiToDB.insertCleanText(text, pageId[i], textId.toString());
                        // insert the words in text in wordlist
                        addWordToHashMap(text, wordList);
                        if (debug)
                            System.out.println("Cleanedpage_id[" + i + "]=" + pageId[i] + "  textList (" + (j + 1)
                                    + "/" + textList.size() + ") length=" + text.length() + "  numPagesUsed="
                                    + numPagesUsed + "  Wordlist[" + wordList.size() + "] ");

                        if (pw != null)
                            pw.println("CLEANED PAGE page_id[" + i + "]=" + pageId[i] + " textList (" + (j + 1)
                                    + "/" + textList.size() + ") length=" + text.length() + " Wordlist["
                                    + wordList.size() + "] " + "  NUM_PAGES_USED=" + numPagesUsed + " text:\n\n"
                                    + text);
                    } else if (pw != null)
                        pw.println("PAGE NOT USED AFTER CLEANING page_id[" + i + "]=" + pageId[i] + " length="
                                + text.length());
                } // for each text in textList
                System.out.println("Cleanedpage_id[" + i + "]=" + pageId[i] + "  numPagesUsed=" + numPagesUsed
                        + "  Wordlist[" + wordList.size() + "] ");
                textList.clear(); // clear the list of text
            }

            String dateStringEnd = fullDate.format(new Date());

            if (pw != null) {
                pw.println("Number of PAGES USED=" + numPagesUsed + " Wordlist[" + wordList.size() + "] "
                        + " minPageLength=" + minPageLength + " minTextLength=" + minTextLength + " Start time:"
                        + dateStringIni + "  End time:" + dateStringEnd);
                pw.close(); // the log is complete; closing it again in the finally block has no effect
            }

            // save the wordList in the DB
            updateWordList(wikiToDB, wordList);

            wikiToDB.printWordList("./wordlist-freq.txt", "frequency", 0, 0);

            System.out.println("\nNumber of pages used=" + numPagesUsed + " Wordlist[" + wordList.size() + "] "
                    + " Start time:" + dateStringIni + "  End time:" + dateStringEnd);

            // Once created the cleantext table delete the wikipedia text, page and revision tables.
            // This is only reached when every previous step succeeded.
            wikiToDB.deleteWikipediaTables();

        } finally {
            if (pw != null)
                pw.close();
            wikiToDB.closeDBConnection();
        }

    }

    /**
     * Prints the current value of every command line parameter to standard output:
     * first the options that carry a value, then the three boolean switches.
     */
    private void printParameters() {
        // Options with a value, collected into one message that is printed with a single println.
        StringBuilder report = new StringBuilder("WikipediaMarkupCleaner parameters:");
        report.append("\n  -mysqlHost ").append(getMysqlHost());
        report.append("\n  -mysqlUser ").append(getMysqlUser());
        report.append("\n  -mysqlPasswd ").append(getMysqlPasswd());
        report.append("\n  -mysqlDB ").append(getMysqlDB());
        report.append("\n  -xmlFile ").append(getXmlWikiFile());
        report.append("\n  -minPage ").append(getMinPageLength());
        report.append("\n  -minText ").append(getMinTextLength());
        report.append("\n  -maxText ").append(getMaxTextLength());
        report.append("\n  -log ").append(getWikiLog());
        report.append("\n  -debugPageId ").append(getTestId());
        System.out.println(report.toString());

        // Boolean switches, one per line; the last one is followed by an empty line.
        System.out.println("  -debug " + (getDebug() ? "true" : "false"));
        System.out.println("  -loadWikiTables " + (getLoadWikiTables() ? "true" : "false"));
        System.out.println("  -deleteCleanTextTable " + (getDeleteCleanTextTable() ? "true" : "false") + "\n");
    }

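    /**
     * Reads and parses the command line arguments and stores their values in this object.
     * 
     * @param args the command line arguments
     * @return true if the arguments were parsed and all required parameters are set, false otherwise
     */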
    private boolean readArgs(String[] args) {

        String help = "\nUsage: java WikipediaMarkupCleaner -locale language -mysqlHost host -mysqlUser user  \n"
                + "                       -mysqlPasswd passwd -mysqlDB wikiDB -xmlFile xmlWikiFile \n"
                + "      default/optional: [-minPage 10000 -minText 1000 -maxText 15000] \n"
                + "      optional: [-log wikiLogFile -debugPageId pageId -debug]\n\n"
                + "      -minPage is the minimum size of a wikipedia page that will be considered for cleaning.\n"
                + "      -minText is the minimum size of a text to be kept in the DB.\n"
                + "      -maxText is used to split big articles in small chunks, this is the maximum chunk size. \n"
                + "      -log the wikiLogFile will contain the cleaned text and information about the pages used.\n"
                + "      -debug will produce more output and it is mainly used to debug a particular Wikipedia page.\n"
                + "      -debugPageId is the page_id number in a wikipedia page table (ex. 18702442), when used this option\n"
                + "           the tables will not be loaded, so it is asumed that page, text and revision tables are already loaded.\n"
                + "      -noLoadWikiTables use this variable to save time NOT loading wiki tables, they must already exist in the the DB.\n"
                + "      -noDeleteCleanTextTable use this variable to do NOT create a new cleanText table, but adding to an already existing\n"
                + "       cleanText table.\n";

        // Six options are required and each one is followed by a value: at least 12 arguments.
        if (args.length < 12) {
            System.out.println(help);
            return false;
        }

        // Options that must be followed by a value.
        final java.util.List<String> valueOptions = java.util.Arrays.asList("-locale", "-mysqlHost", "-mysqlUser",
                "-mysqlPasswd", "-mysqlDB", "-xmlFile", "-minPage", "-minText", "-maxText", "-log",
                "-debugPageId");

        for (int i = 0; i < args.length; i++) {
            String option = args[i];

            // Switches without a value
            if (option.contentEquals("-debug")) {
                setDebug(true);
                continue;
            }
            // Use this variable to save time NOT loading wiki tables, they must already exist in the DB
            if (option.contentEquals("-noLoadWikiTables")) {
                setLoadWikiTables(false);
                continue;
            }
            // Use this variable to do not create a new cleanText table, but adding to an already existing cleanText table.
            if (option.contentEquals("-noDeleteCleanTextTable")) {
                setDeleteCleanTextTable(false);
                continue;
            }

            if (!valueOptions.contains(option)) { // unknown argument
                System.out.println("\nOption not known: " + option);
                System.out.println(help);
                return false;
            }

            // The value is the next argument; without this check a value option given as the
            // very last argument would read past the end of the array.
            if (i + 1 >= args.length) {
                System.out.println("\nMissing value for option: " + option);
                System.out.println(help);
                return false;
            }
            String value = args[++i];

            try {
                if (option.contentEquals("-locale"))
                    setLocale(value);
                else if (option.contentEquals("-mysqlHost"))
                    setMysqlHost(value);
                else if (option.contentEquals("-mysqlUser"))
                    setMysqlUser(value);
                else if (option.contentEquals("-mysqlPasswd"))
                    setMysqlPasswd(value);
                else if (option.contentEquals("-mysqlDB"))
                    setMysqlDB(value);
                else if (option.contentEquals("-xmlFile"))
                    setXmlWikiFile(value);
                // From here the arguments are optional
                else if (option.contentEquals("-minPage"))
                    setMinPageLength(Integer.parseInt(value));
                else if (option.contentEquals("-minText"))
                    setMinTextLength(Integer.parseInt(value));
                else if (option.contentEquals("-maxText"))
                    setMaxTextLength(Integer.parseInt(value));
                else if (option.contentEquals("-log"))
                    setWikiLog(value);
                else if (option.contentEquals("-debugPageId"))
                    setTestId(value);
            } catch (NumberFormatException e) {
                // -minPage, -minText and -maxText need an integer value
                System.out.println("\nInvalid number for option " + option + ": " + value);
                System.out.println(help);
                return false;
            }
        }

        if (getLocale() == null) {
            System.out.println("\nMissing locale.");
            printParameters();
            System.out.println(help);
            return false;
        }

        if (getMysqlHost() == null || getMysqlUser() == null || getMysqlPasswd() == null || getMysqlDB() == null) {
            System.out.println("\nMissing required mysql parameters (one/several required variables are null).");
            printParameters();
            System.out.println(help);
            return false;
        }

        if (getXmlWikiFile() == null) {
            System.out.println("\nMissing required parameter, the XML wikipedia file\n");
            printParameters();
            System.out.println(help);
            return false;
        }

        return true;
    }

    /**
     * Command line entry point: parses the arguments, prints the resulting parameters and then
     * either debugs a single page (when a debug page id was given) or processes all pages.
     *
     * @param args the command line arguments
     * @throws Exception if the processing fails
     */
    public static void main(String[] args) throws Exception {

        final WikipediaMarkupCleaner cleaner = new WikipediaMarkupCleaner();

        // Stop right away when the arguments are not valid.
        final boolean argsOk = cleaner.readArgs(args);
        if (!argsOk) {
            return;
        }

        cleaner.printParameters();

        if (cleaner.getTestId() == null) {
            cleaner.processWikipediaPages();
        } else {
            cleaner.processWikipediaSQLTablesDebug();
        }

    }

}