Java tutorial
/*
 *   Copyright 2014
 *
 *   Licensed under the Apache License, Version 2.0 (the "License");
 *   you may not use this file except in compliance with the License.
 *   You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *   limitations under the License.
 */
package de.tudarmstadt.lt.ltbot.writer;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
import java.nio.charset.Charset;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.lang.StringUtils;
import org.archive.io.RecordingInputStream;
import org.archive.modules.CrawlURI;
import org.archive.modules.Processor;
import org.archive.spring.ConfigPath;
import org.archive.util.FileUtils;

import de.tudarmstadt.lt.ltbot.postprocessor.SharedConstants;
import de.tudarmstadt.lt.ltbot.text.TextExtractor;
import de.tudarmstadt.lt.utilities.TimeUtils;

/**
 * A {@link Processor} that extracts plain text from a fetched URI, splits it
 * into sentences and appends them, one per line, to size-limited output files.
 *
 * <p>Each output line is tab separated:
 * {@code time, sentence, uri, index-of-sentence-within-uri, perplexity}.
 *
 * <p>Output file rotation and all writes to the current output stream are
 * serialized on one internal lock.
 *
 * @author Steffen Remus
 */
public class SentenceWriter extends Processor {

    private static final Logger LOG = Logger.getLogger(SentenceWriter.class.getName());

    /** Key under which a short text excerpt is stored in the URI's data map to mark it as written. */
    private final static String SENTENCE_EXTRACT = "sent";
    /** Placeholder in the filename format that is replaced by the running file number. */
    private final static String FILE_NUMBER_REPLACEMENT = "{num}";
    /** Placeholder in the filename format that is replaced by a timestamp. */
    private final static String TIME_REPLACEMENT = "{time}";

    /** Encoding used for the output files and for the byte counters. */
    private static final Charset UTF8 = Charset.forName("UTF-8");
    /** Number of attempts to open a new output file before giving up. */
    private static final int MAX_OPEN_ATTEMPTS = 10;
    /** Upper bound (exclusive) on the depth of the cause chain logged for a failed open. */
    private static final int MAX_LOGGED_CAUSE_DEPTH = 10;

    /** Guards {@link #_current_file}, {@link #_current_stream} and {@link #_num_current_file}. */
    private final Object _lck = new Object();

    /**
     * Creates a writer with default settings; collaborators are injected via setters.
     */
    public SentenceWriter() {
    }

    /**
     * plain text extractor
     */
    protected TextExtractor _textExtractorInstance;

    public TextExtractor getTextExtractor() {
        return _textExtractorInstance;
    }

    public void setTextExtractor(TextExtractor text_extractor) {
        _textExtractorInstance = text_extractor;
    }

    /**
     * Split and clean sentences from plaintext
     */
    protected SentenceMaker _sentence_maker;

    public SentenceMaker getSentenceMaker() {
        return _sentence_maker;
    }

    public void setSentenceMaker(SentenceMaker sentence_maker) {
        _sentence_maker = sentence_maker;
    }

    /**
     * Top-level directory for sentence files.
     */
    protected ConfigPath _path = new ConfigPath("Sentence writer directory", "${launchId}/sentences");

    public ConfigPath getPath() {
        return _path;
    }

    public void setPath(ConfigPath newpath) {
        _path = newpath;
    }

    /**
     * Language code to focus on for splitting sentences
     */
    protected String _languagecode = "default";

    public String getLanguageCode() {
        return _languagecode;
    }

    public void setLanguageCode(String languagecode) {
        _languagecode = languagecode;
    }

    /**
     * Max size of each file in bytes, compared against {@link File#length()}
     * of the current output file before each write.
     */
    protected long _maxFileSizeBytes = 100 * 1000000; // 100MB

    public long getMaxFileSizeBytes() {
        return _maxFileSizeBytes;
    }

    public void setMaxFileSizeBytes(long maxFileSizeBytes) {
        _maxFileSizeBytes = maxFileSizeBytes;
    }

    /**
     * Optional filename prefix; when non-empty it is prepended as {@code prefix-}.
     */
    protected String _prefix = "";

    public String getPrefix() {
        return _prefix;
    }

    public void setPrefix(String prefix) {
        _prefix = prefix;
    }

    /**
     * Filename format. The default is
     *
     * <pre>{time}-{num}.txt</pre>
     *
     * A non-empty prefix is prepended as {@code prefix-}. If the format ends
     * with {@code .gz} the output is gzip compressed.
     */
    protected String _filename_format = String.format("%s-%s.txt", TIME_REPLACEMENT, FILE_NUMBER_REPLACEMENT);

    public String getFilenameFormat() {
        return _filename_format;
    }

    public void setFilenameFormat(String filename_format) {
        _filename_format = filename_format;
    }

    /** File currently written to, or null before the first file was opened. */
    File _current_file = null;
    /** Stream onto {@link #_current_file}, or null when no file is open. */
    PrintStream _current_stream = null;
    /** Number used for the most recently generated filename. */
    int _num_current_file = -1;

    AtomicLong _num_sentences = new AtomicLong();
    AtomicLong _num_uris = new AtomicLong();
    AtomicLong _num_uris_written = new AtomicLong();
    AtomicLong _num_bytes_written = new AtomicLong();
    AtomicLong _num_bytes_sentences_written = new AtomicLong();

    /* (non-Javadoc)
     * @see org.archive.modules.Processor#start()
     */
    @Override
    public void start() {
        super.start();
    }

    /**
     * Closes the current output stream, if any, and stops the processor.
     * A later write opens a fresh output file.
     *
     * @see org.archive.modules.Processor#stop()
     */
    @Override
    public void stop() {
        // Take the lock so a concurrent writer never sees a half-closed stream.
        synchronized (_lck) {
            if (_current_stream != null) {
                _current_stream.close();
            }
            _current_stream = null;
        }
        super.stop();
    }

    /**
     * Accepts only successfully fetched URIs with content and a fetch status
     * between 200 and 207 (inclusive).
     *
     * @see org.archive.modules.Processor#shouldProcess(org.archive.modules.CrawlURI)
     */
    @Override
    protected boolean shouldProcess(CrawlURI curi) {
        if (!isSuccess(curi) || curi.getContentLength() <= 0) {
            return false;
        }
        int status = curi.getFetchStatus();
        return status >= 200 && status <= 207;
    }

    /**
     * Extracts the plain text of the URI and writes its sentences to the
     * current output file. URIs with an empty recorded response, or URIs that
     * already carry the written-marker in their data map, are skipped.
     *
     * @param curi the URI to process
     * @throws RuntimeException if the output directory cannot be made writeable
     * @see org.archive.modules.Processor#innerProcess(org.archive.modules.CrawlURI)
     */
    @Override
    protected void innerProcess(CrawlURI curi) throws InterruptedException {
        try {
            File basedir = getPath().getFile();
            FileUtils.ensureWriteableDirectory(basedir);
        } catch (IOException e) {
            throw new RuntimeException(String.format("Could not ensure writeable base directory '%s'. %s: %s.", getPath(), e.getClass().getName(), e.getMessage()), e);
        }

        _num_uris.getAndIncrement();

        RecordingInputStream recis = curi.getRecorder().getRecordedInput();
        if (0L == recis.getResponseContentLength()) {
            return;
        }

        // content already written for this URI.
        if (curi.getData().containsKey(SENTENCE_EXTRACT)) {
            return;
        }

        try {
            String cleaned_plaintext = _textExtractorInstance.getCleanedUtf8PlainText(curi);
            // Rotate and write in one critical section so the stream that was
            // just validated cannot be closed or replaced before it is used.
            synchronized (_lck) {
                updateOuputFile();
                writeSentences(curi, cleaned_plaintext);
            }
            String cleaned_plaintext_abbr = StringUtils.abbreviate(cleaned_plaintext, 50);
            curi.getData().put(SENTENCE_EXTRACT, cleaned_plaintext_abbr);
        } catch (IOException e) {
            curi.getNonFatalFailures().add(e);
        }
    }

    /**
     * Splits the text into sentences and appends one tab separated line per
     * sentence to the current output stream, then updates the counters.
     *
     * <p>Requires an open output stream, i.e. a prior successful call to
     * {@link #updateOuputFile()}.
     *
     * @param curi the URI the text belongs to; may be null
     * @param cleaned_plaintext the text to split into sentences
     */
    protected void writeSentences(CrawlURI curi, String cleaned_plaintext) {
        String perplexity_value_as_string = "null";
        if (curi != null && curi.getData() != null) {
            Object obj = curi.getData().get(SharedConstants.EXTRA_INFO_PERPLEXITY);
            if (obj != null) {
                // toString() instead of a cast: a non-String value must not abort the write.
                perplexity_value_as_string = obj.toString();
            }
        }
        String time = TimeUtils.get_ISO_8601_UTC();
        int c = 0;
        synchronized (_lck) {
            for (String sentence : _sentence_maker.getSentences(cleaned_plaintext, _languagecode)) {
                String s = String.format("%s\t%s\t%s\t%d\t%s%n", time, sentence, curi, ++c, perplexity_value_as_string);
                _current_stream.print(s);
                // Count bytes in the same encoding the stream writes.
                _num_bytes_written.getAndAdd(s.getBytes(UTF8).length);
                _num_bytes_sentences_written.getAndAdd(sentence.getBytes(UTF8).length);
            }
            _current_stream.flush();
        }
        _num_sentences.getAndAdd(c);
        _num_uris_written.getAndIncrement();
    }

    /**
     * Ensures there is an open output file that has not exceeded the size
     * limit. If no file is open, or the current file is larger than
     * {@link #getMaxFileSizeBytes()}, the current stream is closed and a new,
     * not yet existing file is opened.
     *
     * @return the file that subsequent writes go to
     * @throws IOException if a new file could not be opened after
     *         {@value #MAX_OPEN_ATTEMPTS} attempts
     */
    protected File updateOuputFile() throws IOException {
        File basedir = getPath().getFile();
        synchronized (_lck) {
            // The check happens under the lock so that concurrent callers
            // cannot both decide to rotate the same file.
            File current = _current_file;
            if (_current_stream != null && current != null && current.length() <= _maxFileSizeBytes) {
                return current;
            }

            if (_current_stream != null) {
                _current_stream.close();
                // Never keep a reference to a closed stream: a PrintStream
                // would silently drop everything written to it.
                _current_stream = null;
            }

            File candidate = null;
            Exception last_failure = null;
            for (int attempt = 1; attempt <= MAX_OPEN_ATTEMPTS; attempt++) {
                // Advance the file number once per candidate name until the
                // name is unused; a candidate that failed to open but was not
                // created on disk is retried as is.
                while (candidate == null || candidate.exists()) {
                    ++_num_current_file;
                    candidate = new File(basedir, getFilename());
                }
                try {
                    PrintStream stream = openPrintToFileStream(candidate);
                    // Publish the new state only after the open succeeded.
                    _current_file = candidate;
                    _current_stream = stream;
                    return candidate;
                } catch (Exception e) {
                    last_failure = e;
                    logOpenFailure(candidate, e);
                }
            }
            throw new IOException(String.format(
                    "Failed to open file for writing: '%s'. I tried %d times but I give up now.",
                    candidate.getAbsolutePath(), MAX_OPEN_ATTEMPTS), last_failure);
        }
    }

    /**
     * Logs a failed attempt to open an output file, one record per element of
     * the cause chain (bounded depth).
     */
    private void logOpenFailure(File file, Throwable failure) {
        Throwable t = failure;
        for (int i = 1; t != null && i < MAX_LOGGED_CAUSE_DEPTH; i++) {
            String message = String.format("Failed to open file for writing: '%s'. (%d %s:%s)", file.getAbsolutePath(), i, t.getClass().getSimpleName(), t.getMessage());
            LOG.log(Level.SEVERE, message, t);
            t = t.getCause();
        }
    }

    /**
     * Builds a filename from the filename format and the current file number:
     * a non-empty prefix is prepended as {@code prefix-}, {@code {time}} is
     * replaced by {@code TimeUtils.getSimple17()} and {@code {num}} by the
     * zero padded, five digit file number. If the format contains no
     * {@code {num}} placeholder, {@code -num} is appended instead.
     *
     * @return the filename for the current file number
     */
    protected String getFilename() {
        String n = String.format("%05d", _num_current_file);
        String filename = getFilenameFormat();
        if (_prefix != null && !_prefix.isEmpty()) {
            filename = _prefix + "-" + filename;
        }
        if (filename.contains(TIME_REPLACEMENT)) {
            filename = filename.replace(TIME_REPLACEMENT, TimeUtils.getSimple17());
        }
        if (filename.contains(FILE_NUMBER_REPLACEMENT)) {
            filename = filename.replace(FILE_NUMBER_REPLACEMENT, n);
        } else {
            filename += "-" + n;
        }
        return filename;
    }

    /**
     * Opens the file in append mode as a UTF-8 {@link PrintStream}. The
     * stream is gzip compressed if the filename format ends with {@code .gz}.
     *
     * @param outputfile the file to append to
     * @return an open stream onto the file
     * @throws IOException if the file cannot be opened; the underlying file
     *         stream is closed before the exception propagates
     */
    protected PrintStream openPrintToFileStream(File outputfile) throws IOException {
        OutputStream os = new FileOutputStream(outputfile, true);
        try {
            if (getFilenameFormat().endsWith(".gz")) {
                os = new GZIPOutputStream(os);
            }
            PrintStream p = new PrintStream(os, false, UTF8.name());
            p.flush();
            return p;
        } catch (IOException e) {
            closeQuietly(os);
            throw e;
        } catch (RuntimeException e) {
            closeQuietly(os);
            throw e;
        }
    }

    /**
     * Closes the stream, ignoring any failure; used only on error paths where
     * the original exception is the one worth reporting.
     */
    private static void closeQuietly(OutputStream os) {
        try {
            os.close();
        } catch (IOException ignored) {
            // the caller is already propagating the primary failure
        }
    }

    /**
     * Appends the writer's counters (sentences, URIs, bytes) to the parent's report.
     *
     * @see org.archive.modules.Processor#report()
     */
    @Override
    public String report() {
        long bytes_written = _num_bytes_written.get();
        long sentence_bytes_written = _num_bytes_sentences_written.get();
        double mega = 1000d * 1000d;
        double giga = 1000d * 1000d * 1000d;

        StringBuilder b = new StringBuilder();
        b.append(super.report());
        b.append(String.format(" %-30.30s %d %n", "Number of Sentences: ", _num_sentences.get()));
        b.append(String.format(" %-30.30s %d %n", "Number of processed URIs: ", _num_uris.get()));
        b.append(String.format(" %-30.30s %d %n", "Number of written URIs: ", _num_uris_written.get()));
        b.append(String.format(" %-30.30s %d %n", "Number of Bytes written: ", bytes_written));
        b.append(String.format(" %-30.30s %.3f %n", "Number of MBytes written: ", bytes_written / mega));
        b.append(String.format(" %-30.30s %.3f %n", "Number of GBytes written: ", bytes_written / giga));
        b.append(String.format(" %-30.30s %d %n", "Number of Sentence Bytes written: ", sentence_bytes_written));
        b.append(String.format(" %-30.30s %.3f %n", "Number of Sentence MBytes written: ", sentence_bytes_written / mega));
        b.append(String.format(" %-30.30s %.3f %n", "Number of Sentence GBytes written: ", sentence_bytes_written / giga));
        return b.toString();
    }

}