// Java tutorial
/**
 * Copyright 2015 DBpedia Spotlight
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.dbpedia.spotlight.parser;

import info.bliki.api.Connector;
import info.bliki.wiki.dump.IArticleFilter;
import info.bliki.wiki.dump.Siteinfo;
import info.bliki.wiki.dump.WikiArticle;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.util.zip.GZIPInputStream;

import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;

/**
 * A Wikipedia XML dump file parser.
 *
 * <p>
 * The parser is a SAX handler: it collects the site information found before
 * the first page, fills one {@link WikiArticle} per <code>page</code> element
 * and hands the article to the configured {@link IArticleFilter} each time a
 * <code>text</code> element is closed.
 *
 * <p>
 * Original version with permission from Marco Schmidt. See: <a
 * href="http://schmidt.devlib.org/software/lucene-wikipedia.html"
 * >http://schmidt.devlib.org/software/lucene-wikipedia.html</a>
 *
 * @author Marco Schmidt
 */
public class WikiXMLParser extends DefaultHandler {
    private static final String WIKIPEDIA_SITEINFO = "siteinfo";
    private static final String WIKIPEDIA_TITLE = "title";
    private static final String WIKIPEDIA_TEXT = "text";
    private static final String WIKIPEDIA_PAGE = "page";
    private static final String WIKIPEDIA_REVISION = "revision";
    private static final String WIKIPEDIA_NAMESPACE = "namespace";
    private static final String WIKIPEDIA_TIMESTAMP = "timestamp";
    private static final String WIKIPEDIA_ID = "id";

    /** Standard SAX2 features that control resolution of external entities. */
    private static final String[] EXTERNAL_ENTITY_FEATURES = {
            "http://xml.org/sax/features/external-general-entities",
            "http://xml.org/sax/features/external-parameter-entities" };

    /** Site information; created when the <code>siteinfo</code> element starts. */
    private Siteinfo fSiteinfo = null;
    /** Value of the <code>key</code> attribute of the namespace element being read. */
    private String fNamespaceKey = null;
    /** Article being filled; stays <code>null</code> until the first page starts. */
    private WikiArticle fArticle;
    /** <code>true</code> once a <code>revision</code> element started inside the current page. */
    private boolean fRevision;
    /** Character data collected for the current element, or <code>null</code> if none. */
    private StringBuilder fData;
    private final XMLReader fXMLReader;
    private final Reader fReader;
    private final IArticleFilter fArticleFilter;

    /**
     * Creates a parser that reads the dump stored in the given file.
     *
     * @param filename
     *            path of the dump; see {@link #getBufferedReader(String)} for
     *            the handling of <code>.gz</code> and <code>.bz2</code> files
     * @param filter
     *            receives every parsed article
     * @throws FileNotFoundException
     *             if the file cannot be opened
     * @throws IOException
     *             if the compressed stream cannot be initialised
     * @throws SAXException
     *             if no XML reader can be created
     */
    public WikiXMLParser(String filename, IArticleFilter filter)
            throws UnsupportedEncodingException, IOException, SAXException, FileNotFoundException {
        this(getBufferedReader(filename), filter);
    }

    /**
     * Creates a parser that reads XML held in memory.
     *
     * @param XMLInput
     *            the XML document itself (not a file name)
     * @param filter
     *            receives every parsed article
     * @param JsonFile
     *            not read by this class; it only distinguishes this constructor
     *            from {@link #WikiXMLParser(String, IArticleFilter)}
     * @throws SAXException
     *             if no XML reader can be created
     */
    public WikiXMLParser(String XMLInput, IArticleFilter filter, boolean JsonFile)
            throws UnsupportedEncodingException, IOException, SAXException, FileNotFoundException {
        this(getBufferedReaderFromString(XMLInput), filter);
    }

    /**
     * Creates a parser that reads UTF-8 encoded XML from a stream.
     *
     * @param inputStream
     *            source of the XML bytes
     * @param filter
     *            receives every parsed article
     * @throws SAXException
     *             if no XML reader can be created or the UTF-8 charset is not
     *             available
     */
    public WikiXMLParser(InputStream inputStream, IArticleFilter filter) throws SAXException {
        this(newUtf8Reader(inputStream), filter);
    }

    /**
     * Creates a parser that reads XML from a character stream.
     *
     * @param reader
     *            source of the XML characters
     * @param filter
     *            receives every parsed article
     * @throws SAXException
     *             if no XML reader can be created
     */
    public WikiXMLParser(Reader reader, IArticleFilter filter) throws SAXException {
        super();
        fArticleFilter = filter;
        fXMLReader = XMLReaderFactory.createXMLReader();
        disableExternalEntities(fXMLReader);
        fXMLReader.setContentHandler(this);
        fXMLReader.setErrorHandler(this);
        fReader = reader;
    }

    /**
     * Wraps a byte stream in a buffered UTF-8 reader. A missing charset is
     * reported to the caller instead of leaving the parser without a reader.
     */
    private static Reader newUtf8Reader(InputStream inputStream) throws SAXException {
        try {
            return new BufferedReader(new InputStreamReader(inputStream, Connector.UTF8_CHARSET));
        } catch (UnsupportedEncodingException e) {
            throw new SAXException("Unsupported encoding: " + Connector.UTF8_CHARSET, e);
        }
    }

    /**
     * Best-effort hardening against XML external entity (XXE) input: asks the
     * reader not to resolve external general or parameter entities.
     */
    private static void disableExternalEntities(XMLReader xmlReader) {
        for (String feature : EXTERNAL_ENTITY_FEATURES) {
            try {
                xmlReader.setFeature(feature, false);
            } catch (SAXNotRecognizedException ignored) {
                // The reader does not know this feature; keep its default
                // rather than refusing to parse at all.
            } catch (SAXNotSupportedException ignored) {
                // The reader cannot change this feature; same reasoning.
            }
        }
    }

    /**
     * Opens a dump file as UTF-8 text. Files whose name ends with
     * <code>.gz</code> or <code>.bz2</code> are decompressed on the fly; any
     * other file is read as is.
     *
     * @param wikiDumpFilename
     *            path of the dump file
     * @return a BufferedReader created from wikiDumpFilename
     * @throws FileNotFoundException
     *             if the file cannot be opened
     * @throws IOException
     *             if the decompressing stream cannot be initialised
     * @throws UnsupportedEncodingException
     *             if UTF-8 is not supported
     */
    public static BufferedReader getBufferedReader(String wikiDumpFilename)
            throws UnsupportedEncodingException, FileNotFoundException, IOException {
        FileInputStream fileStream = new FileInputStream(wikiDumpFilename);
        try {
            InputStream decoded;
            if (wikiDumpFilename.endsWith(".gz")) {
                decoded = new GZIPInputStream(fileStream);
            } else if (wikiDumpFilename.endsWith(".bz2")) {
                decoded = new BZip2CompressorInputStream(fileStream);
            } else {
                decoded = fileStream;
            }
            return new BufferedReader(new InputStreamReader(decoded, "UTF-8"));
        } catch (IOException e) {
            // The decompressor constructors can fail; do not leak the file handle.
            closeQuietly(fileStream);
            throw e;
        } catch (RuntimeException e) {
            closeQuietly(fileStream);
            throw e;
        }
    }

    /** Closes a stream while another failure is already being propagated. */
    private static void closeQuietly(InputStream stream) {
        try {
            stream.close();
        } catch (IOException ignored) {
            // The exception that triggered the cleanup is the one the caller
            // needs to see.
        }
    }

    /**
     * Wraps XML held in a string in a reader. The characters are read directly
     * from the string, so the result does not depend on the platform default
     * charset.
     *
     * @param XMLInput
     *            the XML document itself
     * @return a BufferedReader over the characters of XMLInput
     */
    public static BufferedReader getBufferedReaderFromString(String XMLInput)
            throws UnsupportedEncodingException, FileNotFoundException, IOException {
        return new BufferedReader(new StringReader(XMLInput));
    }

    /**
     * Returns the character data collected so far and clears the buffer.
     *
     * @return the collected text, or <code>null</code> if nothing was collected
     */
    private String getString() {
        if (fData == null) {
            return null;
        }
        String collected = fData.toString();
        fData = null;
        return collected;
    }

    @Override
    public void startDocument() {
        // nothing to prepare
    }

    @Override
    public void endDocument() {
        // nothing to finish
    }

    /**
     * Resets the character buffer and tracks which part of the dump is being
     * read: site information, a namespace entry, a page or a revision.
     */
    @Override
    public void startElement(String namespaceURI, String localName, String qName, Attributes atts) {
        fData = null;
        if (WIKIPEDIA_SITEINFO.equals(qName)) {
            fSiteinfo = new Siteinfo();
            return;
        }
        if (fArticle == null) {
            // Still before the first page: only namespace entries are of interest.
            fNamespaceKey = null;
            if (fSiteinfo != null && WIKIPEDIA_NAMESPACE.equals(qName)) {
                fNamespaceKey = atts.getValue("key");
                return;
            }
        }
        if (WIKIPEDIA_PAGE.equals(qName)) {
            fArticle = new WikiArticle();
            fRevision = false;
        }
        if (WIKIPEDIA_REVISION.equals(qName)) {
            fRevision = true;
        }
    }

    /**
     * Stores the text of the element that just ended, either in the site
     * information (before the first page) or in the current article. Closing a
     * <code>text</code> element passes the article to the filter.
     *
     * <p>
     * A RuntimeException raised while handling the element is printed and not
     * rethrown, so parsing continues with the next element.
     *
     * @throws SAXException
     *             propagated unchanged from the calls made for this element
     */
    @Override
    public void endElement(String uri, String name, String qName) throws SAXException {
        try {
            if (fArticle == null) {
                endSiteinfoElement(qName);
            } else {
                endArticleElement(qName);
            }
            fData = null;
        } catch (RuntimeException re) {
            re.printStackTrace();
        }
    }

    /** Copies a closed site-information element into fSiteinfo, if one exists. */
    private void endSiteinfoElement(String qName) {
        if (fSiteinfo == null) {
            return;
        }
        if (WIKIPEDIA_NAMESPACE.equals(qName) && fNamespaceKey != null) {
            fSiteinfo.addNamespace(fNamespaceKey, getString());
        } else if ("sitename".equals(qName)) {
            fSiteinfo.setSitename(getString());
        } else if ("base".equals(qName)) {
            fSiteinfo.setBase(getString());
        } else if ("generator".equals(qName)) {
            fSiteinfo.setGenerator(getString());
        } else if ("case".equals(qName)) {
            fSiteinfo.setCharacterCase(getString());
        }
    }

    /** Copies a closed page element into fArticle; <code>text</code> triggers the filter. */
    private void endArticleElement(String qName) throws SAXException {
        if (WIKIPEDIA_PAGE.equals(qName)) {
            // Nothing to do: the article was already passed on when its text ended.
        } else if (WIKIPEDIA_TEXT.equals(qName)) {
            fArticle.setText(getString());
            fArticleFilter.process(fArticle, fSiteinfo);
        } else if (WIKIPEDIA_TITLE.equals(qName)) {
            fArticle.setTitle(getString(), fSiteinfo);
        } else if (WIKIPEDIA_TIMESTAMP.equals(qName)) {
            fArticle.setTimeStamp(getString());
        } else if (!fRevision && WIKIPEDIA_ID.equals(qName)) {
            // get the id from wiki page, not the id from the revision
            fArticle.setId(getString());
        }
    }

    /**
     * parse an unlimited amount of characters between 2 enclosing XML-Tags
     *
     * @see org.xml.sax.helpers.DefaultHandler#characters(char[], int, int)
     */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        if (fData == null) {
            fData = new StringBuilder(length);
        }
        fData.append(ch, start, length);
    }

    /**
     * Parses the whole input given at construction time, invoking the article
     * filter along the way.
     *
     * @throws IOException
     *             if reading the input fails
     * @throws SAXException
     *             if the XML reader reports a parse error
     */
    public void parse() throws IOException, SAXException {
        fXMLReader.parse(new InputSource(fReader));
    }
}