Java tutorial
/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.digitalpebble.behemoth.util; import java.io.IOException; import java.net.URLEncoder; import java.util.UUID; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.GnuParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.commons.compress.archivers.ArchiveException; import org.apache.commons.compress.archivers.ArchiveOutputStream; import org.apache.commons.compress.archivers.ArchiveStreamFactory; import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.SequenceFile.Reader; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.digitalpebble.behemoth.BehemothConfiguration; import com.digitalpebble.behemoth.BehemothDocument; import com.digitalpebble.behemoth.DocumentFilter; /** * Stores the content from Behemoth documents into a local directory **/ public class ContentExtractor extends Configured implements Tool { private static final Logger LOG = LoggerFactory.getLogger(ContentExtractor.class); public static final String numEntriesPerArchiveParamName = "numEntriesPerArchive"; public enum FileNamingMode { URL, UUID, NUM; public static FileNamingMode toMode(String str) { try { return valueOf(str); } catch (Exception ex) { return UUID; } } } private FileNamingMode mode = FileNamingMode.UUID; // dump the text otherwise private boolean dumpBinary = false; private ArchiveOutputStream currentArchive = null; private FSDataOutputStream index = null; private int partNum = -1; private int numEntriesInCurrentArchive = 0; private int maxNumEntriesInArchive = 10000; public ContentExtractor() { } public static void main(String[] args) throws Exception { int res = ToolRunner.run(BehemothConfiguration.create(), new ContentExtractor(), args); System.exit(res); } public int run(String[] args) throws Exception { Options options = new Options(); // automatically generate the help statement HelpFormatter formatter = new HelpFormatter(); // create the parser CommandLineParser parser = new GnuParser(); options.addOption("h", "help", false, "print this message"); options.addOption("i", "input", true, "Behemoth corpus"); options.addOption("o", "output", true, "local corpus dir"); options.addOption("b", "binary", false, "dumps binary content, text otherwise"); options.addOption("n", "filenaming", true, "whether to name files based on URL, UUID (default) or NUM"); // parse the command line arguments try { CommandLine line = parser.parse(options, args); String input = line.getOptionValue("i"); String output = line.getOptionValue("o"); if (line.hasOption("help")) { formatter.printHelp("ContentExtractor", options); return 0; } if (input == null || output == null) { formatter.printHelp("ContentExtractor", options); return -1; } dumpBinary = line.hasOption("binary"); if (line.hasOption("filenaming")) { String naming = line.getOptionValue("n"); mode = FileNamingMode.toMode(naming); } return generateDocs(input, output); } catch (ParseException e) { formatter.printHelp("ContentExtractor", options); return -1; } } private int generateDocs(String inputf, String outputf) throws IOException, ArchiveException { Path input = new Path(inputf); Path dirPath = new Path(outputf); FileSystem fsout = FileSystem.get(dirPath.toUri(), getConf()); if (fsout.exists(dirPath) == false) fsout.mkdirs(dirPath); else { System.err.println("Output " + outputf + " already exists"); return -1; } // index file Path indexPath = new Path(dirPath, "index"); if (fsout.exists(indexPath) == false) { fsout.createNewFile(indexPath); } maxNumEntriesInArchive = getConf().getInt(numEntriesPerArchiveParamName, 10000); index = fsout.create(indexPath); createArchive(dirPath); FileSystem fs = input.getFileSystem(getConf()); FileStatus[] statuses = fs.listStatus(input); int count[] = { 0 }; for (int i = 0; i < statuses.length; i++) { FileStatus status = statuses[i]; Path suPath = status.getPath(); if (suPath.getName().equals("_SUCCESS")) continue; generateDocs(suPath, dirPath, count); } if (index != null) index.close(); if (currentArchive != null) { currentArchive.finish(); currentArchive.close(); } return 0; } private void createArchive(Path dirPath) throws IOException, ArchiveException { FileSystem fsout = FileSystem.get(dirPath.toUri(), getConf()); String archiveType = "zip"; partNum++; FSDataOutputStream currentArchiveOS = fsout .create(new Path(dirPath, "part_" + String.format("%06d", partNum) + "." + archiveType)); currentArchive = new ArchiveStreamFactory().createArchiveOutputStream(archiveType, currentArchiveOS); numEntriesInCurrentArchive = 0; } private void addToArchive(String fileName, byte[] content, Path dirPath) throws IOException, ArchiveException { numEntriesInCurrentArchive++; currentArchive.putArchiveEntry(new ZipArchiveEntry(fileName)); currentArchive.write(content); currentArchive.closeArchiveEntry(); index.flush(); if (numEntriesInCurrentArchive == maxNumEntriesInArchive) { currentArchive.finish(); currentArchive.close(); createArchive(dirPath); } } private void generateDocs(Path input, Path dir, int[] count) throws IOException, ArchiveException { DocumentFilter docFilter = DocumentFilter.getFilters(getConf()); Reader[] cacheReaders = SequenceFileOutputFormat.getReaders(getConf(), input); for (Reader current : cacheReaders) { // read the key + values in that file Text key = new Text(); BehemothDocument inputDoc = new BehemothDocument(); while (current.next(key, inputDoc)) { count[0]++; // filter the doc? if (!docFilter.keep(inputDoc)) continue; if (dumpBinary && inputDoc.getContent() == null) continue; else if (!dumpBinary && inputDoc.getText() == null) continue; String fileName = Integer.toString(count[0]); String urldoc = inputDoc.getUrl(); if (mode.equals(FileNamingMode.URL) && urldoc != null && urldoc.length() > 0) { fileName = URLEncoder.encode(urldoc, "UTF-8"); } else if (mode.equals(FileNamingMode.UUID) && urldoc != null && urldoc.length() > 0) { fileName = UUID.nameUUIDFromBytes(urldoc.getBytes()).toString(); } else { fileName = String.format("%09d", count[0]); } if (!dumpBinary) fileName += ".txt"; byte[] contentBytes; if (dumpBinary) contentBytes = inputDoc.getContent(); else contentBytes = inputDoc.getText().getBytes("UTF-8"); // out.write(contentBytes, 0, contentBytes.length); addToArchive(fileName, contentBytes, dir); // add the mapping URL->filename in the index -> archive num index.writeBytes(urldoc + "\t" + fileName + "\t" + String.format("%06d", partNum) + "\n"); } current.close(); } } }