Java tutorial
/*
 * MediaWiki import/export processing tools
 * Copyright 2005 by Brion Vibber
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * $Id: XmlDumpReader.java 59325 2009-11-22 01:21:03Z rainman $
 */

package org.mediawiki.importer;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.security.NoSuchAlgorithmException;
import java.util.Calendar;
import java.util.GregorianCalendar;
import java.util.HashMap;
import java.util.Map;
import java.util.TimeZone;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import com.mysql.jdbc.Connection;
import com.mysql.jdbc.Statement;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

// added by philipp.staender@gmail.com
// required for mongodb + mysql access
import com.mongodb.BasicDBObject;
import com.mongodb.Mongo;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import java.security.MessageDigest;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.bson.types.ObjectId;

/**
 * SAX handler that reads a MediaWiki XML dump and, for every revision,
 * stores the article in a MongoDB collection and in a MySQL table.
 *
 * <p>Constructing an instance is destructive: it drops the configured MongoDB
 * database and drops/recreates the MySQL tables it writes to.
 */
public class XmlDumpReader extends DefaultHandler {
    /** Matches HTML comments, including multi-line ones. */
    private static final Pattern HTML_COMMENT = Pattern.compile("(?s)<!--.*?-->");

    /** Matches a wiki heading of the form {@code == Title ==} with its surrounding whitespace. */
    private static final Pattern SECTION_HEADING = Pattern.compile("\\s+\\=\\=\\s+.+\\s+\\=\\=\\s+");

    /** Matches simple internal links such as {@code [[Some Page]]}. */
    private static final Pattern WIKI_LINK =
            Pattern.compile("\\[\\[[0-9\\s\\'\\\"\\.\\-\\_\\p{L}]+\\]\\]");

    /**
     * Matches a single-line redirect page; group 1 is the redirect target.
     * The keyword is matched case-insensitively.
     */
    private static final Pattern REDIRECT = Pattern.compile(
            "#(?:redirect|weiterleitung)\\s+\\[\\[(.+?)\\]\\].*", Pattern.CASE_INSENSITIVE);

    private static final String INSERT_ARTICLE_SQL =
            "INSERT INTO `articles` (`ID`, `MongoID`, `Title`, `Redirect`, `Comment`, `Content`, `Links`) "
            + "VALUES (NULL, ?, ?, ?, ?, ?, ?)";

    private static final String INSERT_TEXTINDEX_SQL =
            "INSERT INTO `textindex` (`ID`, `ArticleID`, `MongoID`, `Sort`, `Title`, `Text`, `Links`) "
            + "VALUES (NULL, ?, ?, ?, ?, ?, ?)";

    InputStream input;
    DumpWriter writer;

    private char[] buffer;
    private int len;
    private boolean hasContent = false;
    private boolean deleted = false;

    Siteinfo siteinfo;
    Page page;
    boolean pageSent;
    Contributor contrib;
    Revision rev;
    int nskey;

    // volatile so that a call to abort() from another thread is seen by the parsing thread
    volatile boolean abortFlag;

    // by philipp.staender
    private DBCollection mongodbArticles = null;
    private DBCollection mongodbTextindexes = null;
    private Connection mysqlConnection = null;
    // NOTE(review): connection settings and credentials are hard-coded; consider making them configurable
    private String mysqlUsername = "root";
    private String mysqlPassword = "root";
    private String mysqlHost = "localhost";
    private int mysqlPort = 3306;
    private String mysqlDatabase = "nosql";
    private String mongodbDatabasename = "wikipedia";
    // number of rows successfully inserted into `articles` by this instance
    private int lastInsertedArticleID = 0;
    private static boolean generateTextIndizes = false;

    /** One pending row for the MySQL {@code textindex} table. */
    private static final class SectionRow {
        final String mongoId;
        final int sort;
        final String title;
        final String text;
        final String links;

        SectionRow(String mongoId, int sort, String title, String text, String links) {
            this.mongoId = mongoId;
            this.sort = sort;
            this.title = title;
            this.text = text;
            this.links = links;
        }
    }

    /**
     * Initialize a processor for a MediaWiki XML dump stream.
     * Events are sent to a single DumpWriter output sink, but you
     * can chain multiple output processors with a MultiWriter.
     *
     * <p>Also drops and re-initialises the MongoDB database and the MySQL
     * tables. If either database cannot be initialised the JVM is terminated
     * with exit status 1.
     *
     * @param inputStream Stream to read XML from.
     * @param writer Output sink to send processed events to.
     */
    public XmlDumpReader(InputStream inputStream, DumpWriter writer) {
        input = inputStream;
        this.writer = writer;
        buffer = new char[4096];
        len = 0;
        hasContent = false;

        // by philipp.staender
        try {
            // open the connection to mongodb and fetch the collections
            Mongo m = new Mongo();
            System.out.println("Initialisiere mongodb Datenbank '" + this.mongodbDatabasename + "'");
            m.dropDatabase(this.mongodbDatabasename);
            DB db = m.getDB(this.mongodbDatabasename);
            System.out.println("Setze Indizes in mongodb collections...");
            this.mongodbArticles = db.getCollection("articles");
            this.mongodbArticles.ensureIndex("title");
            if (XmlDumpReader.generateTextIndizes) {
                this.mongodbTextindexes = db.getCollection("textindex");
                this.mongodbTextindexes.ensureIndex("title");
                this.mongodbTextindexes.ensureIndex("article");
                // NOTE(review): text-index documents are stored with the key "link", not "links",
                // and have no "sections" field -- confirm these two indexes are intended
                this.mongodbTextindexes.ensureIndex("links");
                this.mongodbTextindexes.ensureIndex("sections.subtitle");
            }
        } catch (Exception e) {
            System.err.println("Fehler beim Initialisieren der mongodb: " + e.getMessage());
            System.err.println("Applikation wird beendet");
            // non-zero status: this is a failure, not a normal termination
            System.exit(1);
        }

        // open connection to mysql
        String dbURL = "jdbc:mysql://" + this.mysqlHost + ":" + this.mysqlPort + "/" + this.mysqlDatabase;
        Statement stmt = null;
        try {
            this.mysqlConnection = (Connection) DriverManager.getConnection(dbURL, this.mysqlUsername,
                    this.mysqlPassword);
            // prepare the tables
            System.out.println("Initialisiere notwendige Tabellen in mysql Datenbank '" + this.mysqlDatabase + "'");
            stmt = (Statement) this.mysqlConnection.createStatement();
            stmt.executeUpdate("DROP TABLE IF EXISTS `articles`;");
            stmt.executeUpdate("CREATE TABLE IF NOT EXISTS `articles` ("
                    + " `ID` int(11) NOT NULL AUTO_INCREMENT,"
                    + " `MongoID` varchar(255) NOT NULL,"
                    + " `Title` varchar(255) NOT NULL,"
                    + " `Redirect` varchar(255) NOT NULL,"
                    + " `Comment` text NOT NULL,"
                    + " `Content` longtext NOT NULL,"
                    + " `Links` text NOT NULL,"
                    + " PRIMARY KEY (`ID`),"
                    + " KEY `ArticleTitle` (`Title`))"
                    + " ENGINE=MyISAM DEFAULT CHARSET=utf8;");
            if (XmlDumpReader.generateTextIndizes) {
                stmt.executeUpdate("DROP TABLE IF EXISTS `textindex`;");
                stmt.executeUpdate("CREATE TABLE IF NOT EXISTS `textindex` ("
                        + " `ID` int(11) NOT NULL AUTO_INCREMENT,"
                        + " `ArticleID` int(11) NOT NULL,"
                        + " `MongoID` varchar(255) DEFAULT NULL,"
                        + " `Sort` int(11) NOT NULL,"
                        + " `Title` varchar(255) DEFAULT NULL,"
                        + " `Text` longtext,"
                        + " `Links` text NOT NULL,"
                        + " PRIMARY KEY (`ID`),"
                        + " KEY `ArticleID` (`ArticleID`))"
                        + " ENGINE=MyISAM DEFAULT CHARSET=utf8;");
            }
        } catch (SQLException ex) {
            // handle any errors
            System.err.println("Fehler beim Initialisieren der mysql Datenbank...");
            System.err.println("SQLException: " + ex.getMessage());
            System.err.println("SQLState: " + ex.getSQLState());
            System.err.println("VendorError: " + ex.getErrorCode());
            System.err.println("Applikation wird beendet");
            System.exit(1);
        } finally {
            closeQuietly(stmt);
        }
    }

    /**
     * Reads through the entire XML dump on the input stream, sending
     * events to the DumpWriter as it goes. May throw exceptions on
     * invalid input or due to problems with the output.
     * The MySQL connection opened by the constructor is closed when parsing
     * ends, whether it succeeded or not.
     *
     * @throws IOException on I/O errors, or wrapping a parser configuration or SAX error
     */
    public void readDump() throws IOException {
        try {
            SAXParserFactory factory = SAXParserFactory.newInstance();
            SAXParser parser = factory.newSAXParser();
            parser.parse(input, this);
        } catch (ParserConfigurationException e) {
            throw (IOException) new IOException(e.getMessage()).initCause(e);
        } catch (SAXException e) {
            throw (IOException) new IOException(e.getMessage()).initCause(e);
        } finally {
            closeMysqlConnection();
        }
        writer.close();
    }

    /**
     * Request that the dump processing be aborted.
     * At the next element, an exception will be thrown to stop the XML parser.
     * The flag is volatile, so it may be set from a thread other than the parsing one.
     */
    public void abort() {
        abortFlag = true;
    }

    // --------------------------
    // SAX handler interface methods:

    private static final Map<String, String> startElements = new HashMap<String, String>(64);
    private static final Map<String, String> endElements = new HashMap<String, String>(64);
    static {
        startElements.put("revision", "revision");
        startElements.put("contributor", "contributor");
        startElements.put("page", "page");
        startElements.put("mediawiki", "mediawiki");
        startElements.put("siteinfo", "siteinfo");
        startElements.put("namespaces", "namespaces");
        startElements.put("namespace", "namespace");

        endElements.put("ThreadSubject", "ThreadSubject");
        endElements.put("ThreadParent", "ThreadParent");
        endElements.put("ThreadAncestor", "ThreadAncestor");
        endElements.put("ThreadPage", "ThreadPage");
        endElements.put("ThreadID", "ThreadID");
        endElements.put("ThreadSummaryPage", "ThreadSummaryPage");
        endElements.put("ThreadAuthor", "ThreadAuthor");
        endElements.put("ThreadEditStatus", "ThreadEditStatus");
        endElements.put("ThreadType", "ThreadType");
        endElements.put("base", "base");
        endElements.put("case", "case");
        endElements.put("comment", "comment");
        endElements.put("contributor", "contributor");
        endElements.put("generator", "generator");
        endElements.put("id", "id");
        endElements.put("ip", "ip");
        endElements.put("mediawiki", "mediawiki");
        endElements.put("minor", "minor");
        endElements.put("namespaces", "namespaces");
        endElements.put("namespace", "namespace");
        endElements.put("page", "page");
        endElements.put("restrictions", "restrictions");
        endElements.put("revision", "revision");
        endElements.put("siteinfo", "siteinfo");
        endElements.put("sitename", "sitename");
        endElements.put("text", "text");
        endElements.put("timestamp", "timestamp");
        endElements.put("title", "title");
        endElements.put("username", "username");
    }

    /**
     * Resets the character buffer, records the {@code deleted="deleted"}
     * attribute of the element and dispatches to the matching open handler.
     * Elements not listed in {@code startElements} are ignored.
     *
     * @throws SAXException if an abort was requested or a handler failed with an IOException
     */
    @Override
    public void startElement(String uri, String localname, String qName, Attributes attributes)
            throws SAXException {
        // Clear the buffer for character data; we'll initialize it
        // if and when character data arrives -- at that point we
        // have a length.
        len = 0;
        hasContent = false;

        if (abortFlag)
            throw new SAXException("XmlDumpReader set abort flag.");

        // check for deleted="deleted", and set deleted flag for the current element.
        deleted = "deleted".equals(attributes.getValue("deleted"));

        try {
            // the map lookup is a cheap filter for tags we do not handle
            String tag = startElements.get(qName);
            if (tag == null)
                return;

            // frequent tags:
            if (tag.equals("revision")) openRevision();
            else if (tag.equals("contributor")) openContributor();
            else if (tag.equals("page")) openPage();
            // rare tags:
            else if (tag.equals("mediawiki")) openMediaWiki();
            else if (tag.equals("siteinfo")) openSiteinfo();
            else if (tag.equals("namespaces")) openNamespaces();
            else if (tag.equals("namespace")) openNamespace(attributes);
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /** Appends the reported character data to the internal buffer, growing it as needed. */
    @Override
    public void characters(char[] ch, int start, int length) {
        if (buffer.length < len + length) {
            // at least double the capacity so repeated small appends stay cheap
            int maxlen = buffer.length * 2;
            if (maxlen < len + length)
                maxlen = len + length;
            char[] tmp = new char[maxlen];
            System.arraycopy(buffer, 0, tmp, 0, len);
            buffer = tmp;
        }
        System.arraycopy(ch, start, buffer, len, length);
        len += length;
        hasContent = true;
    }

    /**
     * Dispatches to the read/close handler for the element that just ended.
     * Elements not listed in {@code endElements} are ignored.
     *
     * @throws SAXException if a handler failed with an IOException
     */
    @Override
    public void endElement(String uri, String localname, String qName) throws SAXException {
        try {
            String tag = endElements.get(qName);
            if (tag == null)
                return;

            // frequent tags:
            if (tag.equals("id")) readId();
            else if (tag.equals("revision")) closeRevision();
            else if (tag.equals("timestamp")) readTimestamp();
            else if (tag.equals("text")) readText();
            else if (tag.equals("contributor")) closeContributor();
            else if (tag.equals("username")) readUsername();
            else if (tag.equals("ip")) readIp();
            else if (tag.equals("comment")) readComment();
            else if (tag.equals("minor")) readMinor();
            else if (tag.equals("page")) closePage();
            else if (tag.equals("title")) readTitle();
            else if (tag.equals("restrictions")) readRestrictions();
            // rare tags:
            else if (tag.startsWith("Thread")) threadAttribute(tag);
            else if (tag.equals("mediawiki")) closeMediaWiki();
            else if (tag.equals("siteinfo")) closeSiteinfo();
            else if (tag.equals("sitename")) readSitename();
            else if (tag.equals("base")) readBase();
            else if (tag.equals("generator")) readGenerator();
            else if (tag.equals("case")) readCase();
            else if (tag.equals("namespaces")) closeNamespaces();
            else if (tag.equals("namespace")) closeNamespace();
        } catch (IOException e) {
            throw (SAXException) new SAXException(e.getMessage()).initCause(e);
        }
    }

    // ----------

    /** Stores a discussion-threading attribute on the current page; "ThreadPage" is parsed as a Title. */
    void threadAttribute(String attrib) throws IOException {
        if (attrib.equals("ThreadPage")) // parse title
            page.DiscussionThreadingInfo.put(attrib, new Title(bufferContents(), siteinfo.Namespaces));
        else
            page.DiscussionThreadingInfo.put(attrib, bufferContents());
    }

    void openMediaWiki() throws IOException {
        siteinfo = null;
        writer.writeStartWiki();
    }

    void closeMediaWiki() throws IOException {
        writer.writeEndWiki();
        siteinfo = null;
    }

    // ------------------

    void openSiteinfo() {
        siteinfo = new Siteinfo();
    }

    void closeSiteinfo() throws IOException {
        writer.writeSiteinfo(siteinfo);
    }

    /** Returns the buffered character data, or null if the current element had no character data. */
    private String bufferContentsOrNull() {
        if (!hasContent)
            return null;
        return bufferContents();
    }

    /** Returns the buffered character data of the current element; never null. */
    private String bufferContents() {
        return len == 0 ? "" : new String(buffer, 0, len);
    }

    void readSitename() {
        siteinfo.Sitename = bufferContents();
    }

    void readBase() {
        siteinfo.Base = bufferContents();
    }

    void readGenerator() {
        siteinfo.Generator = bufferContents();
    }

    void readCase() {
        siteinfo.Case = bufferContents();
    }

    void openNamespaces() {
        siteinfo.Namespaces = new NamespaceSet();
    }

    void openNamespace(Attributes attribs) {
        nskey = Integer.parseInt(attribs.getValue("key"));
    }

    void closeNamespace() {
        siteinfo.Namespaces.add(nskey, bufferContents());
    }

    void closeNamespaces() {
        // NOP
    }

    // -----------

    void openPage() {
        page = new Page();
        pageSent = false;
    }

    void closePage() throws IOException {
        if (pageSent)
            writer.writeEndPage();
        page = null;
    }

    void readTitle() {
        page.Title = new Title(bufferContents(), siteinfo.Namespaces);
    }

    /**
     * Assigns the parsed id to the innermost open object: contributor,
     * then revision, then page.
     *
     * @throws IllegalArgumentException if none of them is open
     */
    void readId() {
        int id = Integer.parseInt(bufferContents());
        if (contrib != null)
            contrib.Id = id;
        else if (rev != null)
            rev.Id = id;
        else if (page != null)
            page.Id = id;
        else
            throw new IllegalArgumentException("Unexpected <id> outside a <page>, <revision>, or <contributor>");
    }

    void readRestrictions() {
        page.Restrictions = bufferContents();
    }

    // ------

    void openRevision() throws IOException {
        if (!pageSent) {
            writer.writeStartPage(page);
            pageSent = true;
        }
        rev = new Revision();
    }

    /**
     * Stores the finished revision in MongoDB and MySQL and clears it.
     * (modified by philipp.staender@gmail.com: the revision is written to the
     * databases here rather than passed on to the writer)
     */
    void closeRevision() throws IOException {
        insertToMongoDBAndMySQL();
        rev = null;
    }

    /**
     * Returns the SHA-1 digest of the UTF-8 encoding of {@code text} as a
     * 40-character lower-case hex string. A null argument yields the empty
     * string.
     *
     * @throws IllegalStateException if the JRE lacks SHA-1 or UTF-8
     */
    static String generateHashForID(String text) {
        if (text == null)
            return "";
        try {
            MessageDigest sha = MessageDigest.getInstance("SHA1");
            // fixed charset so that IDs do not depend on the platform default encoding
            byte[] hash = sha.digest(text.getBytes("UTF-8"));
            StringBuilder sb = new StringBuilder(hash.length * 2);
            for (int i = 0; i < hash.length; ++i) {
                // OR-ing with 0x100 forces three hex digits; dropping the first leaves a zero-padded pair
                sb.append(Integer.toHexString((hash[i] & 0xFF) | 0x100).substring(1, 3));
            }
            return sb.toString();
        } catch (NoSuchAlgorithmException e) {
            throw new IllegalStateException("SHA-1 digest is not available", e);
        } catch (UnsupportedEncodingException e) {
            throw new IllegalStateException("UTF-8 charset is not available", e);
        }
    }

    /**
     * Returns the target of a single-line redirect page
     * ({@code #REDIRECT [[Target]]} or {@code #WEITERLEITUNG [[Target]]}),
     * or the empty string if {@code wikitext} is not such a page.
     */
    private static String extractRedirectTarget(String wikitext) {
        if (wikitext == null)
            return "";
        Matcher m = REDIRECT.matcher(wikitext.trim());
        return m.matches() ? m.group(1).trim() : "";
    }

    /**
     * Stores the current revision of the current page.
     * (added by philipp.staender@gmail.com)
     *
     * <p>The text (HTML comments removed, article title prepended as first
     * heading) is split at {@code == heading ==} lines. Each non-empty section
     * becomes an entry of the MongoDB document's "sections" list and its
     * simple {@code [[links]]} are collected. Redirect pages get a "redirect"
     * field and no sections. The raw text is stored in the MySQL
     * {@code articles} table. If text indexing is enabled, every section is
     * additionally stored in the {@code textindex} collection and table.
     *
     * <p>Failures are reported on stderr and do not stop the import.
     */
    void insertToMongoDBAndMySQL() {
        final String comment = rev.Comment != null ? rev.Comment : "";
        final String title = page.Title != null ? page.Title.toString() : "";
        // null text means the revision text was deleted/suppressed; store it as empty
        final String rawText = rev.Text != null ? rev.Text : "";

        String text = HTML_COMMENT.matcher(rawText).replaceAll("");
        // the first paragraph simply gets the article title as its heading
        text = "\n== " + title + " ==\n \n" + text;

        // split the text at its sub-headings
        String[] sections = SECTION_HEADING.split(text);
        List<String> subtitles = new ArrayList<String>();
        Matcher headings = SECTION_HEADING.matcher(text);
        while (headings.find()) {
            String heading = headings.group().trim();
            // strip the leading "== " and trailing " ==" from the heading
            subtitles.add(heading.substring(3, heading.length() - 3));
        }

        try {
            BasicDBObject doc = new BasicDBObject();
            doc.put("title", title);
            doc.put("comment", comment);
            String mongoid = XmlDumpReader.generateHashForID(text);
            doc.put("_id", mongoid);

            List<Object> paragraphs = new ArrayList<Object>();
            List<String> links = new ArrayList<String>();
            // comma-prefixed list of all links seen so far, e.g. ",A,B"
            StringBuilder linkCsv = new StringBuilder();
            List<SectionRow> sectionRows = new ArrayList<SectionRow>();

            // is the article a redirect?
            String redirect = extractRedirectTarget(rawText);
            if (redirect.length() > 0) {
                doc.put("redirect", redirect);
                System.out.println(title + " => " + redirect);
            } else {
                // every sub-section is additionally stored on its own.
                // The split result starts with the (empty) text before the injected title
                // heading, so the index starts at -2 to make the first real section index 0.
                int sectionIndex = -2;
                int textindexCount = 0;
                for (String section : sections) {
                    sectionIndex++;
                    // only add sections that contain text
                    if (section.trim().length() == 0)
                        continue;
                    textindexCount++;
                    String subtitle = (sectionIndex >= 0 && sectionIndex < subtitles.size())
                            ? subtitles.get(sectionIndex)
                            : "";

                    BasicDBObject paragraph = new BasicDBObject();
                    paragraph.put("subtitle", subtitle);
                    paragraph.put("content", section);
                    paragraphs.add(paragraph);

                    // record all links of this section
                    Matcher linkMatcher = WIKI_LINK.matcher(section);
                    while (linkMatcher.find()) {
                        String target = linkMatcher.group().trim();
                        // strip the surrounding [[ ... ]]
                        target = target.substring(2, target.length() - 2);
                        links.add(target);
                        linkCsv.append(',').append(target);
                    }

                    // each section also goes into its own collection/table for full-text search
                    if (XmlDumpReader.generateTextIndizes) {
                        String textindexMongoID = XmlDumpReader
                                .generateHashForID(text + "fortextsearch" + String.valueOf(sectionIndex));
                        BasicDBObject textIndexDoc = new BasicDBObject();
                        textIndexDoc.put("article", title);
                        textIndexDoc.put("order", textindexCount);
                        textIndexDoc.put("title", subtitle);
                        textIndexDoc.put("text", section);
                        textIndexDoc.put("_id", textindexMongoID);
                        // NOTE(review): this is the cumulative link list of the article so far,
                        // not only the links of this section -- confirm that is intended
                        textIndexDoc.put("link", links);
                        this.mongodbTextindexes.insert(textIndexDoc);
                        // the MySQL row is written after the article row so it can reference its ID.
                        // NOTE(review): Links keeps the cumulative, comma-prefixed value the
                        // previous implementation stored -- confirm that is intended
                        sectionRows.add(new SectionRow(textindexMongoID, textindexCount, subtitle, section,
                                linkCsv.toString()));
                    }
                }
            }

            // mysql insert of the complete article
            String articleLinks = linkCsv.length() > 0 ? linkCsv.substring(1) : "";
            boolean articleStored = insertArticleRow(mongoid, title, redirect, comment, rawText, articleLinks);

            // Section rows need the parent's ID, so they are only written when the article row exists.
            // Assumes `articles` IDs count up from 1: the constructor recreates the table and
            // this instance is the only writer, so the counter equals the ID just assigned.
            if (articleStored && !sectionRows.isEmpty())
                insertSectionRows(this.lastInsertedArticleID, sectionRows);

            // insert the article into mongodb
            doc.put("sections", paragraphs);
            doc.put("links", links);
            this.mongodbArticles.insert(doc);

            System.out.println("'" + title + "' ... ok\n");
        } catch (Exception e) {
            // best effort: a failing article must not abort the whole import
            System.err.println("Fehler beim mongodb insert von '" + title + "': " + e.getMessage());
        }
    }

    /**
     * Inserts one row into the MySQL {@code articles} table and, on success,
     * increments {@code lastInsertedArticleID}.
     *
     * @return true if the row was inserted, false if the insert failed (the error is logged)
     */
    private boolean insertArticleRow(String mongoid, String title, String redirect, String comment,
            String content, String links) {
        PreparedStatement stmt = null;
        try {
            stmt = this.mysqlConnection.prepareStatement(INSERT_ARTICLE_SQL);
            stmt.setString(1, mongoid);
            stmt.setString(2, title);
            stmt.setString(3, redirect);
            stmt.setString(4, comment);
            stmt.setString(5, content);
            stmt.setString(6, links);
            stmt.executeUpdate();
            this.lastInsertedArticleID++;
            return true;
        } catch (SQLException e) {
            System.err.println("Fehler beim mysql insert: " + e.getMessage());
            return false;
        } finally {
            closeQuietly(stmt);
        }
    }

    /** Inserts the given section rows into the MySQL {@code textindex} table; failures are logged per row. */
    private void insertSectionRows(int articleId, List<SectionRow> rows) {
        PreparedStatement stmt = null;
        try {
            stmt = this.mysqlConnection.prepareStatement(INSERT_TEXTINDEX_SQL);
            for (SectionRow row : rows) {
                try {
                    stmt.setInt(1, articleId);
                    stmt.setString(2, row.mongoId);
                    stmt.setInt(3, row.sort);
                    stmt.setString(4, row.title);
                    stmt.setString(5, row.text);
                    stmt.setString(6, row.links);
                    stmt.executeUpdate();
                } catch (SQLException e) {
                    System.err.println("Fehler beim mysql insert von textindex: " + e.getMessage());
                }
            }
        } catch (SQLException e) {
            System.err.println("Fehler beim mysql insert von textindex: " + e.getMessage());
        } finally {
            closeQuietly(stmt);
        }
    }

    /** Closes a JDBC statement, ignoring null and close failures. */
    private static void closeQuietly(java.sql.Statement stmt) {
        if (stmt == null)
            return;
        try {
            stmt.close();
        } catch (SQLException ignored) {
            // nothing useful can be done if releasing the statement fails
        }
    }

    /** Closes the MySQL connection if it is open; a close failure is logged. */
    private void closeMysqlConnection() {
        if (this.mysqlConnection == null)
            return;
        try {
            this.mysqlConnection.close();
        } catch (SQLException e) {
            System.err.println("Fehler beim Schliessen der mysql Verbindung: " + e.getMessage());
        }
        this.mysqlConnection = null;
    }

    /**
     * Returns {@code str} as a quoted and escaped MySQL string literal.
     * (added by philipp.staender@gmail.com, taken from SQLWriter)
     *
     * @param str the value to escape; null yields the SQL keyword {@code NULL}
     * @return the literal including its surrounding single quotes
     */
    protected static String sqlEscape(String str) {
        if (str == null)
            return "NULL";
        final int len = str.length();
        if (len == 0)
            return "''";
        StringBuilder sql = new StringBuilder(len * 2);
        sql.append('\'');
        for (int i = 0; i < len; i++) {
            char c = str.charAt(i);
            switch (c) {
            case '\u0000':
                sql.append('\\').append('0');
                break;
            case '\n':
                sql.append('\\').append('n');
                break;
            case '\r':
                sql.append('\\').append('r');
                break;
            case '\u001a':
                sql.append('\\').append('Z');
                break;
            case '"':
            case '\'':
            case '\\':
                sql.append('\\');
                // fall through
            default:
                sql.append(c);
                break;
            }
        }
        sql.append('\'');
        return sql.toString();
    }

    void readTimestamp() {
        rev.Timestamp = parseUTCTimestamp(bufferContents());
    }

    void readComment() {
        rev.Comment = bufferContentsOrNull();
        if (rev.Comment == null && !deleted)
            rev.Comment = ""; // NOTE: null means deleted/suppressed
    }

    void readMinor() {
        rev.Minor = true;
    }

    void readText() {
        rev.Text = bufferContentsOrNull();
        if (rev.Text == null && !deleted)
            rev.Text = ""; // NOTE: null means deleted/suppressed
    }

    // -----------

    void openContributor() {
        // XXX: record deleted flag?! as it is, any empty <contributor> tag counts as "deleted"
        contrib = new Contributor();
    }

    void closeContributor() {
        // NOTE: if the contributor was suppressed, neither username nor id have been set in the Contributor object
        rev.Contributor = contrib;
        contrib = null;
    }

    void readUsername() {
        contrib.Username = bufferContentsOrNull();
    }

    void readIp() {
        contrib.Username = bufferContents();
        contrib.isIP = true;
    }

    private static final TimeZone utc = TimeZone.getTimeZone("UTC");

    /**
     * Parses a timestamp of the form {@code 2003-10-26T04:50:47Z} into a UTC
     * calendar with second precision.
     */
    private static Calendar parseUTCTimestamp(String text) {
        // 2003-10-26T04:50:47Z
        // We're doing this manually for now, though DateFormatter might work...
        String trimmed = text.trim();
        GregorianCalendar ts = new GregorianCalendar(utc);
        // a new calendar holds the current time; clear it so no stray milliseconds remain
        ts.clear();
        ts.set(
            Integer.parseInt(trimmed.substring(0, 0 + 4)),      // year
            Integer.parseInt(trimmed.substring(5, 5 + 2)) - 1,  // month is 0-based!
            Integer.parseInt(trimmed.substring(8, 8 + 2)),      // day
            Integer.parseInt(trimmed.substring(11, 11 + 2)),    // hour
            Integer.parseInt(trimmed.substring(14, 14 + 2)),    // minute
            Integer.parseInt(trimmed.substring(17, 17 + 2)));   // second
        return ts;
    }
}