Java tutorial
/** * Copyright 2011 Applied Research in Patacriticism and the University of Virginia * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. **/ package org.nines; import java.io.File; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.net.URL; import java.net.URLEncoder; import java.text.SimpleDateFormat; import java.util.*; import org.apache.log4j.Logger; import org.apache.log4j.xml.DOMConfigurator; import org.nines.RDFIndexerConfig.Mode; import com.google.gson.Gson; import com.google.gson.JsonArray; import com.google.gson.JsonElement; import com.google.gson.JsonObject; public class RDFIndexer { private int numFiles = 0; private int numObjects = 0; private int numReferences = 0; private long largestTextSize = 0; private RDFIndexerConfig config; private Queue<File> dataFileQueue; private ErrorReport errorReport; private LinkCollector linkCollector; private Logger log; private AsyncPoster asyncPoster; private JsonArray jsonPayload = new JsonArray(); private int postCount = 0; private SolrClient solrClient; private Date ts = new Date(); private SimpleDateFormat ts2 = new SimpleDateFormat("yyyy-MM-dd"); private String timeStamp = new String(ts2.format(ts)); // special field names private final String isPartOf = "isPartOf"; private final String hasPart = "hasPart"; /** * * @param config * @param config */ public RDFIndexer(RDFIndexerConfig config) { this.config = config; String logFileRoot = this.config.getLogfileBaseName(""); // setup logger String indexLog = this.config.getLogfileBaseName("progress") + "_progress.log"; System.setProperty("index.log.file", indexLog); URL url = ClassLoader.getSystemResource("log4j-index.xml"); DOMConfigurator.configure(url); this.log = Logger.getLogger(RDFIndexer.class.getName()); // keep report file in the same folder as the log file. String logName; if (this.config.mode.equals(Mode.INDEX) || this.config.mode.equals(Mode.TEST)) { logName = logFileRoot + "_error.log"; } else { logName = logFileRoot + "_" + this.config.mode.toString().toLowerCase() + "_error.log"; } File reportFile = new File(logName); try { this.errorReport = new ErrorReport(reportFile); } catch (IOException e1) { this.log.error("Unable to open error report log for writing, aborting indexer."); return; } this.linkCollector = new LinkCollector(this.config.getLogfileBaseName("links")); this.solrClient = new SolrClient(this.config.solrBaseURL); this.asyncPoster = new AsyncPoster(1); } /** * Execute the configured indexing task */ public void execute() { // There is only something else to do if a MODE was configured if (config.mode.equals(Mode.NONE) == false) { // first, ensure that core is valid and exists try { this.solrClient.validateCore(config.coreName()); } catch (IOException e) { this.errorReport.addError(new IndexerError("Validate core", "", e.getMessage())); } // if a purge was requested, it must be done FIRST if (config.deleteAll) { purgeArchive(config.coreName()); } // execute based on mode setting if (config.mode.equals(Mode.SPIDER)) { this.log.info("Full Text Spider Mode"); doSpidering(); } else if (config.mode.equals(Mode.CLEAN_RAW)) { this.log.info("Raw Text Cleanup Mode"); doRawTextCleanup(); } else if (config.mode.equals(Mode.CLEAN_FULL)) { this.log.info("Full Text Cleanup Mode"); doFullTextCleanup(); } else if (config.mode.equals(Mode.INDEX)) { this.log.info("Index Mode"); doIndexing(); } else if (config.mode.equals(Mode.RESOLVE)) { this.log.info("Resolve Mode"); doResolving(); } else { this.log.info("*** TEST MODE: Not committing changes to SOLR"); doIndexing(); } } this.asyncPoster.shutdown(); this.errorReport.close(); this.linkCollector.close(); } private void doFullTextCleanup() { Date start = new Date(); this.log.info("Started raw text cleanup at " + start); this.dataFileQueue = new LinkedList<File>(); String fullPath = config.sourceDir.toString() + "/" + RDFIndexerConfig.safeArchive(config.archiveName); recursivelyQueueFiles(new File(fullPath), false); int totalFiles = this.dataFileQueue.size(); FullTextCleaner cleaner = new FullTextCleaner(config.archiveName, this.errorReport, config.customCleanClass); while (this.dataFileQueue.size() > 0) { File txtFile = this.dataFileQueue.remove(); cleaner.clean(txtFile); this.errorReport.flush(); } String stats = "Cleaned " + totalFiles + " files (Original Size: " + cleaner.getOriginalLength() + ", Cleaned Size: " + cleaner.getCleanedLength() + ", Total Files Cleaned: " + cleaner.getTotalFilesChanged() + ")"; Date end = new Date(); double durationSec = (end.getTime() - start.getTime()) / 1000.0; if (durationSec >= 60) { this.log.info(String.format("%s in %3.2f minutes.", stats, (durationSec / 60.0))); } else { this.log.info(String.format("%s in %3.2f seconds.", stats, durationSec)); } } private void doRawTextCleanup() { Date start = new Date(); log.info("Started raw text cleanup at " + start); this.dataFileQueue = new LinkedList<File>(); String rawPath = config.sourceDir.toString() + "/" + RDFIndexerConfig.safeArchive(config.archiveName); recursivelyQueueFiles(new File(rawPath), false); int totalFiles = this.dataFileQueue.size(); RawTextCleaner cleaner = new RawTextCleaner(config, this.errorReport); while (this.dataFileQueue.size() > 0) { File rawFile = this.dataFileQueue.remove(); cleaner.clean(rawFile); this.errorReport.flush(); } String stats = "Cleaned " + totalFiles + " files (Original Size: " + cleaner.getOriginalLength() + ", Cleaned Size: " + cleaner.getCleanedLength() + ", Total Files Cleaned: " + cleaner.getTotalFilesChanged() + ")"; Date end = new Date(); double durationSec = (end.getTime() - start.getTime()) / 1000.0; if (durationSec >= 60) { this.log.info(String.format("%s in %3.2f minutes.", stats, (durationSec / 60.0))); } else { this.log.info(String.format("%s in %3.2f seconds.", stats, durationSec)); } } /** * find the full path to the corrected text root baseed on * the path to the original rdf sources * @return */ private String findCorrectedTextRoot() { String path = config.sourceDir.toString(); int pos = path.indexOf("/rdf/"); path = path.substring(0, pos) + "/correctedtext/"; path += RDFIndexerConfig.safeArchive(config.archiveName) + "/"; return path; } private void doIndexing() { Date start = new Date(); log.info("Started indexing at " + start); System.out.println("Indexing " + config.sourceDir); indexDirectory(config.sourceDir); System.out.println("Indexing DONE"); // report indexing stats Date end = new Date(); double durationSec = (end.getTime() - start.getTime()) / 1000.0; if (durationSec >= 60) { this.log.info( String.format("Indexed " + numFiles + " files (" + numObjects + " objects) in %3.2f minutes.", (durationSec / 60.0))); } else { this.log.info(String.format( "Indexed " + numFiles + " files (" + numObjects + " objects) in %3.2f seconds.", durationSec)); } this.log.info("Largest text field size: " + this.largestTextSize); } private void doResolving() { Date start = new Date(); log.info("Started resolving at " + start); System.out.println("Started resolving at " + start); updateReferenceFields(); System.out.println("Resolving DONE"); // report indexing stats Date end = new Date(); double durationSec = (end.getTime() - start.getTime()) / 1000.0; if (durationSec >= 60) { this.log.info(String.format("Resolved/updated " + numReferences + " references in %3.2f minutes.", (durationSec / 60.0))); } else { this.log.info(String.format("Resolved/updated " + numReferences + " references in %3.2f seconds.", durationSec)); } } private void doSpidering() { Date start = new Date(); log.info("Started full-text spider at " + start); System.out.println("Full-text spider of " + config.sourceDir); spiderDirectory(config.sourceDir); System.out.println("DONE"); // report indexing stats Date end = new Date(); double durationSec = (end.getTime() - start.getTime()) / 1000.0; if (durationSec >= 60) { this.log.info(String.format("Spidered " + numFiles + " files in %3.2f minutes.", (durationSec / 60.0))); } else { this.log.info(String.format("Spidered " + numFiles + " files in %3.2f seconds.", durationSec)); } } private void purgeArchive(final String coreName) { log.info("Deleting all data from: " + coreName); try { this.solrClient.postJSON("{\"delete\": { \"query\": \"*:*\"}, \"commit\": {}}", coreName); } catch (IOException e) { errorReport.addError( new IndexerError("", "", "Unable to POST DELETE message to SOLR. " + e.getLocalizedMessage())); } } private void recursivelyQueueFiles(final File dir, final boolean rdfMode) { if (dir.isDirectory()) { log.info("loading directory: " + dir.getPath()); File fileList[] = dir.listFiles(); for (File entry : fileList) { if (entry.getName().endsWith(".svn") || entry.getName().endsWith(".git")) { log.info("Skipping source control directory"); continue; } if (entry.isDirectory()) { recursivelyQueueFiles(entry, rdfMode); } if (rdfMode) { if (entry.getName().endsWith(".rdf") || entry.getName().endsWith(".xml")) { this.dataFileQueue.add(entry); } } else { this.dataFileQueue.add(entry); } } } else { // a file was passed in, not a folder this.log.info("loading file: " + dir.getPath()); this.dataFileQueue.add(dir); } } /** * Run through all rdf files in the directory and harvest full text * from remote sites. * * @param rdfDir */ private void spiderDirectory(final File rdfDir) { this.dataFileQueue = new LinkedList<File>(); recursivelyQueueFiles(rdfDir, true); this.numFiles = this.dataFileQueue.size(); log.info("=> Spider text for " + rdfDir + " total files: " + this.numFiles); RdfTextSpider spider = new RdfTextSpider(config, this.errorReport); while (this.dataFileQueue.size() > 0) { File rdfFile = this.dataFileQueue.remove(); this.log.info("Spider text from file " + rdfFile.toString()); spider.spider(rdfFile); try { Thread.sleep(10); } catch (InterruptedException e) { } this.errorReport.flush(); } } /** * run through all RDF files in the directory and write them * to a solr archive. * * @param rdfDir */ private void indexDirectory(File rdfDir) { // see if corrected texts exist. config.correctedTextDir = new File(findCorrectedTextRoot()); if (config.correctedTextDir.exists()) { // it does; grab a list of filenames that have corrected text and cache them. // The file names are URIs with ugly characters replaces. Rules... // '/' is replaced by _S_ and ':' by _C_ // Undo this and save a list of corrected doc URIs for (File entry : config.correctedTextDir.listFiles()) { if (entry.getName().endsWith(".txt")) { config.correctedTextMap.put( entry.getName().replaceAll("_C_", ":").replaceAll("_S_", "\\/").replaceAll(".txt", ""), entry.getName()); } } } this.dataFileQueue = new LinkedList<File>(); recursivelyQueueFiles(rdfDir, true); this.numFiles = this.dataFileQueue.size(); log.info("=> Indexing " + rdfDir + " total files: " + this.numFiles); while (this.dataFileQueue.size() > 0) { File rdfFile = this.dataFileQueue.remove(); indexFile(rdfFile); } if (config.isTestMode() == false) { // flush any remaining data flush(); // commit the changes and wait for all the workers to complete this.asyncPoster.asyncCommit(this.solrClient, config.coreName()); this.asyncPoster.waitForPending(); // if we actually processed any documents, process any isPartOf or hasPart references if (this.numObjects != 0 && this.config.isPagesArchive() == false) { updateReferenceFields(); } } } private void indexFile(File file) { HashMap<String, HashMap<String, ArrayList<String>>> objects; // Parse a file into a hashmap. // Key is object URI, Value is a set of key-value pairs // that describe the object try { objects = RdfDocumentParser.parse(file, this.errorReport, this.linkCollector, config); } catch (IOException e) { this.errorReport.addError(new IndexerError(file.getName(), "", e.getMessage())); return; } // Log an error for no objects and bail if size is zero if (objects == null || objects.size() == 0) { errorReport.addError(new IndexerError(file.getName(), "", "No objects in this file.")); errorReport.flush(); return; } // save the largest text field size this.largestTextSize = Math.max(this.largestTextSize, RdfDocumentParser.getLargestTextSize()); for (Map.Entry<String, HashMap<String, ArrayList<String>>> entry : objects.entrySet()) { String uri = entry.getKey(); HashMap<String, ArrayList<String>> object = entry.getValue(); // Validate archive and push objects into new archive map ArrayList<String> objectArray = object.get("archive"); if (objectArray != null) { String objArchive = objectArray.get(0); if (!objArchive.equals(config.archiveName)) { this.errorReport.addError(new IndexerError(file.getName(), uri, "The wrong archive was found. " + objArchive + " should be " + config.archiveName)); } } else { this.errorReport.addError( new IndexerError(file.getName(), uri, "Unable to determine archive for this object.")); } // validate all other parts of object and generate error report try { ArrayList<String> messages = ValidationUtility.validateObject(this.config.isPagesArchive(), object); for (String message : messages) { IndexerError e = new IndexerError(file.getName(), uri, message); errorReport.addError(e); } } catch (Exception valEx) { System.err.println("ERROR Validating file:" + file.getName() + " URI: " + uri); valEx.printStackTrace(); IndexerError e = new IndexerError(file.getName(), uri, valEx.getMessage()); errorReport.addError(e); } // turn this object into an XML solr docm then xml string. Add this to the curr payload JsonElement jsonDoc = docToJson(uri, object); this.jsonPayload.add(jsonDoc); if (config.isTestMode() == false) { flushIfEnough(); } } this.numObjects += objects.size(); this.errorReport.flush(); } // // update the references for any isPartOf or hasPart fields // private void updateReferenceFields() { int size = config.pageSize; String fl = config.getFieldList(); String coreName = config.coreName(); List<String> orList = new ArrayList<String>(); orList.add(isPartOf + "=http*"); orList.add(hasPart + "=http*"); while (true) { List<JsonObject> results = this.solrClient.getResultsPage(coreName, config.archiveName, 0, size, fl, null, orList); if (results.isEmpty() == true) { log.info("No more references to resolve"); break; } log.info("Got " + results.size() + " references to resolve"); for (JsonObject json : results) { log.info("Resolving references for " + json.get("uri").getAsString()); updateDocumentReferences(json); this.numReferences++; } // flush any data and wait for completion... flush(); // commit the changes and wait for all the workers to complete this.asyncPoster.asyncCommit(this.solrClient, config.coreName()); this.asyncPoster.waitForPending(); } } // // resolve the isPartOf or hasPart references for the specified document // private void updateDocumentReferences(final JsonObject json) { String fl = config.getFieldList(); String coreName = config.coreName(); String uri = json.get("uri").getAsString(); boolean updated = false; try { if (json.has(isPartOf) == true) { JsonArray refs = json.getAsJsonArray(isPartOf); //log.info( "isPartOf: " + refs.toString( ) ); JsonArray objs = new JsonArray(); for (int ix = 0; ix < refs.size(); ix++) { List<String> andList = new ArrayList<String>(); andList.add("uri=" + URLEncoder.encode("\"" + refs.get(ix).getAsString() + "\"", "UTF-8")); List<JsonObject> results = this.solrClient.getResultsPage(coreName, config.archiveName, 0, 1, fl, andList, null); if (results.isEmpty() == false) { objs.add(removeExcessFields(results.get(0))); } else { // reference to a non-existent object, note in the error log IndexerError e = new IndexerError("", uri, "Cannot resolve isPartOf reference (" + refs.get(ix).getAsString() + ") for document " + uri); errorReport.addError(e); } } // remove the field; we may replace it with resolved data json.remove(isPartOf); updated = true; // did we resolve any of the references if (objs.size() != 0) { //log.info( "UPDATING isPartOf: " + objs.toString( ) ); json.addProperty(isPartOf, objs.toString()); } } if (json.has(hasPart) == true) { JsonArray refs = json.getAsJsonArray(hasPart); //log.info( "hasPart: " + refs.toString( ) ); JsonArray objs = new JsonArray(); for (int ix = 0; ix < refs.size(); ix++) { List<String> andList = new ArrayList<String>(); andList.add("uri=" + URLEncoder.encode("\"" + refs.get(ix).getAsString() + "\"", "UTF-8")); List<JsonObject> results = this.solrClient.getResultsPage(coreName, config.archiveName, 0, 1, fl, andList, null); if (results.isEmpty() == false) { objs.add(removeExcessFields(results.get(0))); } else { // reference to a non-existent object, note in the error log IndexerError e = new IndexerError("", uri, "Cannot resolve hasPart reference (" + refs.get(ix).getAsString() + ") for document " + uri); errorReport.addError(e); } } // remove the field; we may replace it with resolved data json.remove(hasPart); updated = true; if (objs.size() != 0) { //log.info( "UPDATING hasPart: " + objs.toString( ) ); json.addProperty(hasPart, objs.toString()); } } if (updated == true) { this.jsonPayload.add(json); flushIfEnough(); } } catch (UnsupportedEncodingException ex) { // should never happen } } // // remove the fields we do not want for reference documents // private JsonObject removeExcessFields(JsonObject json) { json.remove(isPartOf); json.remove(hasPart); json.remove("text"); json.remove("_version_"); json.remove("year_sort_desc"); json.remove("federation"); json.remove("year"); json.remove("decade"); json.remove("year_sort"); json.remove("year_sort_asc"); json.remove("title_sort"); json.remove("author_sort"); json.remove("date_created"); json.remove("date_updated"); json.remove("century"); json.remove("half_century"); json.remove("quarter_century"); return (json); } private JsonElement docToJson(String documentName, HashMap<String, ArrayList<String>> fields) { Gson gson = new Gson(); JsonObject obj = gson.toJsonTree(fields).getAsJsonObject(); obj.addProperty("date_created", this.timeStamp); obj.addProperty("date_updated", this.timeStamp); return obj; } private void flushIfEnough() { if (this.jsonPayload.toString().length() >= config.maxUploadSize) flushPending(); } private void flush() { if (this.jsonPayload.size() > 0) flushPending(); } // flush pending data to SOLR private void flushPending() { this.asyncPoster.asyncPost(this.solrClient, config.coreName(), this.jsonPayload.toString()); this.jsonPayload = new JsonArray(); this.postCount++; if (postCount % 5 == 0) { this.asyncPoster.asyncCommit(this.solrClient, config.coreName()); } } }