// Java tutorial
/** * Copyright 2011 Applied Research in Patacriticism and the University of Virginia * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. **/ package org.nines; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.StringReader; import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.nio.charset.CodingErrorAction; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import org.apache.commons.io.IOUtils; import org.apache.commons.lang.StringEscapeUtils; import org.apache.log4j.Logger; import org.openrdf.rio.ParseErrorListener; import org.openrdf.rio.RDFHandlerException; import org.openrdf.rio.RDFParseException; import org.openrdf.rio.rdfxml.RDFXMLParser; public class RdfDocumentParser { private static long largestTextSize = 0; public final static Logger log = Logger.getLogger(RdfDocumentParser.class.getName()); public static long getLargestTextSize() { return largestTextSize; } public static HashMap<String, HashMap<String, ArrayList<String>>> parse(final File file, ErrorReport errorReport, LinkCollector linkCollector, RDFIndexerConfig config) throws IOException { largestTextSize = 0; RDFXMLParser parser = new RDFXMLParser(); NinesStatementHandler statementHandler = new NinesStatementHandler(errorReport, linkCollector, config); statementHandler.setFile(file); parser.setRDFHandler(statementHandler); parser.setParseErrorListener(new ParseListener(file, 
errorReport)); parser.setVerifyData(true); parser.setStopAtFirstError(false); // parse file try { String content = validateContent(file, errorReport); parser.parse(new StringReader(content), "http://foo/" + file.getName()); } catch (RDFParseException e) { errorReport.addError(new IndexerError(file.getName(), "", "Parse Error on Line " + e.getLineNumber() + ": " + e.getMessage())); } catch (RDFHandlerException e) { errorReport.addError( new IndexerError(file.getName(), "", "StatementHandler Exception: " + e.getMessage())); } catch (Exception e) { errorReport.addError(new IndexerError(file.getName(), "", "RDF Parser Error: " + e.getMessage())); e.printStackTrace(); } // retrieve parsed data HashMap<String, HashMap<String, ArrayList<String>>> docHash = statementHandler .getDocuments(config.isPagesArchive()); // process tags Collection<HashMap<String, ArrayList<String>>> documents = docHash.values(); for (HashMap<String, ArrayList<String>> document : documents) { // normalize tags, replace spaces with dashes, lowercase ArrayList<String> tags = document.remove("tag"); if (tags != null) { for (int i = 0; i < tags.size(); i++) { String tag = tags.get(i); tag = tag.toLowerCase(); tag = tag.replaceAll(" ", "-"); tags.set(i, tag); } // username is archive name String archive = document.get("archive").get(0); ArrayList<String> nameList = new ArrayList<String>(); nameList.add(archive); document.put("username", nameList); document.put(archive + "_tag", tags); } } largestTextSize = statementHandler.getLargestTextSize(); return docHash; } private static String validateContent(File file, ErrorReport errorReport) { InputStreamReader is = null; try { Charset cs = Charset.availableCharsets().get("UTF-8"); CharsetDecoder decoder = cs.newDecoder(); decoder.onMalformedInput(CodingErrorAction.REPLACE); decoder.onUnmappableCharacter(CodingErrorAction.REPLACE); is = new InputStreamReader(new FileInputStream(file), decoder); String content = IOUtils.toString(is); // look for unescaped 
sequences and flag them as trouble String unescaped = StringEscapeUtils.unescapeXml(content); int startPos = 0; while (true) { int pos = unescaped.indexOf("&#", startPos); if (pos > -1) { String snip = unescaped.substring(Math.max(0, pos - 25), Math.min(unescaped.length(), pos + 25)); IndexerError e = new IndexerError(file.getName(), "", "Potentially Invalid Escape sequence.\n Position: [" + pos + "]\n Snippet: [" + snip + "]"); errorReport.addError(e); startPos = pos + 2; } else { break; } } return content; } catch (IOException e) { errorReport .addError(new IndexerError(file.getName(), "", "Error validating content: " + e.getMessage())); } finally { IOUtils.closeQuietly(is); } return ""; } private static final class ParseListener implements ParseErrorListener { private ErrorReport errorReport; private File file; ParseListener(File file, ErrorReport errorReport) { this.errorReport = errorReport; this.file = file; } public void warning(String msg, int lineNo, int colNo) { this.errorReport.addError(new IndexerError(file.getName(), "", "Parse warning at line " + lineNo + ", col " + colNo + " : " + msg)); } public void error(String msg, int lineNo, int colNo) { this.errorReport.addError(new IndexerError(file.getName(), "", "Parse error at line " + lineNo + ", col " + colNo + " : " + msg)); } public void fatalError(String msg, int lineNo, int colNo) { this.errorReport.addError(new IndexerError(file.getName(), "", "FATAL PARSE ERROR at line " + lineNo + ", col " + colNo + " : " + msg)); } } }