Java tutorial
/** * Copyright 2011 Applied Research in Patacriticism and the University of Virginia * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. **/ package org.nines; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.Writer; import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.nio.charset.CodingErrorAction; import org.apache.commons.io.IOUtils; import org.apache.log4j.Logger; /** * Cleaner for full text files. It will fix escape sequences, strip bad * utf-8 characters and normalize whitespace. The result will overwrite * the prior full text file. It should ony be run once * * @author loufoster * */ public class FullTextCleaner { private CharsetDecoder decoder; private ErrorReport errorReport; private String archiveName; private Logger log; private String custom; private long totalOrigChars = 0; private long totalFilesChanged = 0; private long totalCleanedChars = 0; public FullTextCleaner(String archiveName, ErrorReport errorReport, String custom) { this.errorReport = errorReport; this.archiveName = archiveName; this.log = Logger.getLogger(FullTextCleaner.class.getName()); this.custom = custom; Charset cs = Charset.availableCharsets().get("UTF-8"); this.decoder = cs.newDecoder(); this.decoder.onMalformedInput(CodingErrorAction.REPLACE); this.decoder.onUnmappableCharacter(CodingErrorAction.REPLACE); } public void clean(File txtFile) { this.log.info("Clean full text from file " + txtFile.toString()); // Read the text from the file. Bail if this fails String content = null; InputStreamReader is = null; try { is = new InputStreamReader(new FileInputStream(txtFile), this.decoder); content = IOUtils.toString(is); } catch (Exception e) { this.errorReport.addError( new IndexerError(txtFile.toString(), "", "Unable to read full text file: " + e.toString())); return; } finally { IOUtils.closeQuietly(is); } // stats! long startChars = content.length(); ; this.totalOrigChars += startChars; // clean it up content = TextUtils.stripEscapeSequences(content, this.errorReport, txtFile); content = TextUtils.normalizeWhitespace(content); // Look for unknown character and warn int pos = content.indexOf("\ufffd"); if (pos > -1) { String snip = content.substring(Math.max(0, pos - 25), Math.min(content.length(), pos + 25)); errorReport.addError(new IndexerError(txtFile.toString(), "", "Invalid UTF-8 character at position " + pos + "\n Snippet: [" + snip + "]")); } if (custom != null && custom.length() > 0) { try { @SuppressWarnings("rawtypes") Class newClass = Class.forName("org.nines.cleaner." + this.custom); ICustomCleaner cleaner = (ICustomCleaner) newClass.newInstance(); content = cleaner.clean(this.archiveName, content); } catch (Exception e) { errorReport.addError(new IndexerError(txtFile.toString(), "", "Unable to run custom cleaner " + this.custom + ": " + e.toString())); } } // final stats long endChars = content.length(); this.totalCleanedChars += endChars; if (endChars != startChars) { this.totalFilesChanged++; } this.log.info(" => Original length: " + startChars + ", Cleaned length: " + endChars + ", Delta:" + (startChars - endChars)); // write out the cleaned content over the existing content Writer outWriter = null; try { outWriter = new OutputStreamWriter(new FileOutputStream(txtFile), "UTF-8"); outWriter.write(content); } catch (IOException e) { this.errorReport.addError( new IndexerError(txtFile.toString(), "", "Unable to write cleaned text file: " + e.toString())); } finally { IOUtils.closeQuietly(outWriter); } } public long getTotalFilesChanged() { return this.totalFilesChanged; } public long getOriginalLength() { return this.totalOrigChars; } public long getCleanedLength() { return this.totalCleanedChars; } }