/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.archive.nutchwax;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Map.Entry;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Writer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.fetcher.FetcherOutputFormat;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilterException;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ProtocolStatus;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.StringUtil;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveReaderFactory;
import org.archive.io.arc.ARCRecord;
import org.archive.io.arc.ARCRecordMetaData;

/**
 * Import archive (.arc/.warc) files into a newly-created Nutch segment.
 *
 * <code>Importer</code> is coded as a Hadoop job and is intended to be run
 * within the Hadoop framework, or at least started by the Hadoop launcher
 * incorporated into Nutch.
 * Although there is a <code>main</code> driver, the Nutch launcher script is
 * strongly recommended.
 *
 * This class was initially adapted from the Nutch <code>Fetcher</code> and
 * <code>ArcSegmentCreator</code> classes.  The premise is that since the
 * Nutch fetching process acquires external content and places it in a Nutch
 * segment, we can perform a similar activity by taking content from the ARC
 * files and placing that content in a Nutch segment in a similar fashion.
 * Ideally, once the <code>Importer</code> is used to import a set of ARCs
 * into a Nutch segment, the resulting segment should be more-or-less the
 * same as one created by Nutch's own Fetcher.
 *
 * Since we are mimicking the Nutch Fetcher, we have to be careful about some
 * implementation details that might not seem relevant to the importing of
 * ARC files.  I've noted those details with comments prefaced with "?:".
 */
public class ImporterToHdfs extends Configured implements Tool,
        Mapper<WritableComparable, Writable, Text, NutchWritable> {

    public static final Log LOG = LogFactory.getLog(ImporterToHdfs.class);

    private JobConf jobConf;
    private URLFilters urlFilters;
    private ScoringFilters scfilters;
    private ParseUtil parseUtil;
    private URLNormalizers normalizers;
    private int interval;
    private HTTPStatusCodeFilter httpStatusCodeFilter;

    private String seqFilePrefix;
    private String seqFileSuffix;
    private String seqFilePath;

    /**
     * ?: Is this necessary?
     */
    public ImporterToHdfs() {
    }

    /**
     * <p>
     * Constructor that sets the job configuration.
     * </p>
     *
     * @param conf
     */
    public ImporterToHdfs(Configuration conf) {
        setConf(conf);
    }

    /**
     * <p>
     * Configures the job.  Sets the url filters, scoring filters, url
     * normalizers and other relevant data.
     * </p>
     *
     * @param job
     *            The job configuration.
     */
    public void configure(JobConf job) {
        // Set the url filters, scoring filters, the parse util and the url
        // normalizers.
        this.jobConf = job;
        this.urlFilters = new URLFilters(jobConf);
        this.scfilters = new ScoringFilters(jobConf);
        this.parseUtil = new ParseUtil(jobConf);
        this.normalizers = new URLNormalizers(jobConf, URLNormalizers.SCOPE_FETCHER);
        this.interval = jobConf.getInt("db.fetch.interval.default", 2592000);
        this.httpStatusCodeFilter = new HTTPStatusCodeFilter(jobConf.get("nutchwax.filter.http.status"));
        this.seqFilePrefix = jobConf.get("nutchwax.importer.hdfs.seqfileprefix");
        this.seqFileSuffix = jobConf.get("nutchwax.importer.hdfs.seqfilesuffix");
        this.seqFilePath = jobConf.get("nutchwax.importer.hdfs.seqfilepath");
    }

    /**
     * In Mapper interface.
     *
     * @inherit
     */
    public void close() {
    }

    /**
     * <p>
     * Runs the Map job to import records from an archive file into a Nutch
     * segment.
     * </p>
     *
     * @param key
     *            Line number in manifest corresponding to the
     *            <code>value</code>
     * @param value
     *            A line from the manifest
     * @param output
     *            The output collector.
     * @param reporter
     *            The progress reporter.
     */
    public void map(final WritableComparable key, final Writable value,
            final OutputCollector output, final Reporter reporter)
            throws IOException {
        boolean success = false;
        String arcUrl = "";
        String collection = "";
        String segmentName = getConf().get(Nutch.SEGMENT_NAME_KEY);

        // First, ignore blank manifest lines and those that are comments.
        String line = value.toString().trim();
        if (line.length() == 0 || line.charAt(0) == '#') {
            // Ignore it.
            return;
        }

        // Each line of the manifest is "<url> <collection>" where
        // <collection> is optional.
        String[] parts = line.split("\\s+");
        arcUrl = parts[0];
        if (parts.length > 1) {
            collection = parts[1];
        }

        if (LOG.isInfoEnabled()) {
            LOG.info("Importing ARC: " + arcUrl);
        }

        ArchiveReader r = ArchiveReaderFactory.get(arcUrl);
        r.setDigest(true);
        ArcReader reader = new ArcReader(r);

        try {
            // XXX: Rewritten to accommodate HDFS sequence-file writing.
            String prefix = seqFilePrefix;
            String suffix = seqFileSuffix;
            String outfilepath = seqFilePath;

            Configuration conf = getConfiguration();
            FileSystem fs = getFileSystem(outfilepath, conf);

            String arcfilename = prefix + arcUrl.substring(arcUrl.lastIndexOf("/") + 1) + suffix;
            Path path = new Path(outfilepath + "/" + arcfilename);
            if (fs.exists(path)) {
                fs.delete(path, true);
            }

            Writer writer = SequenceFile.createWriter(fs, conf, path, Text.class, Text.class);

            for (ARCRecord record : reader) {
                // When reading WARC files, records of type other than
                // "response" are returned as 'null' by the Iterator, so
                // we skip them.
                if (record == null) {
                    continue;
                }
                importRecord(record, segmentName, collection, output, writer);
                reporter.progress();
            }
            writer.close();
            System.out.println("Finished processing " + arcfilename);

            /*
             * // Test reading the sequence file back.
             * Reader seqreader = new Reader(fs, path, conf);
             * Text key_r = new Text();
             * Text val_r = new Text();
             *
             * while (seqreader.next(key_r, val_r)) {
             *     System.out.println(key_r + "\t len: " + val_r.getLength());
             * }
             */

            success = true;
        } catch (Exception e) {
            LOG.warn("Error processing archive file: " + arcUrl, e);
            if (jobConf.getBoolean("nutchwax.import.abortOnArchiveReadError", false)) {
                throw new IOException(e);
            }
        } finally {
            r.close();
            // Use "true".equals(...) so a missing system property cannot NPE.
            if ("true".equals(System.getProperty("fullPathExecution"))) {
                if (!success) {
                    add2ProcessedFailedFile(arcUrl);
                }
                add2ProcessedFile(arcUrl);
            }
            if (LOG.isInfoEnabled()) {
                LOG.info("Completed ARC: " + arcUrl);
            }
        }
    }

    private static Configuration getConfiguration() {
        Configuration conf = new Configuration();
        conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
        conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
        return conf;
    }

    private FileSystem getFileSystem(String outfilepath, Configuration conf) {
        try {
            if (outfilepath.startsWith("hdfs://")) {
                return FileSystem.get(new URI(outfilepath), conf);
            } else {
                return FileSystem.get(conf);
            }
        } catch (IOException e) {
            LOG.error(e.getMessage());
            throw new RuntimeException(e);
        } catch (URISyntaxException e) {
            LOG.error(e.getMessage());
            throw new RuntimeException(e);
        }
    }

    /**
     * Import an ARCRecord.
     *
     * @param record
     * @param segmentName
     * @param collectionName
     * @param output
     * @return whether record was imported or not (i.e. filtered out due to
     *         URL filtering rules, etc.)
     */
    private boolean importRecord(ARCRecord record, String segmentName,
            String collectionName, OutputCollector output, Writer writer) {
        ARCRecordMetaData meta = record.getMetaData();

        if (LOG.isInfoEnabled()) {
            LOG.info("Consider URL: " + meta.getUrl() + " (" + meta.getMimetype()
                    + ") [" + meta.getLength() + "]");
        }

        if (!this.httpStatusCodeFilter.isAllowed(record.getStatusCode())) {
            if (LOG.isInfoEnabled()) {
                LOG.info("Skip URL: " + meta.getUrl() + " HTTP status:"
                        + record.getStatusCode());
            }
            return false;
        }

        try {
            // Skip the HTTP headers in the response body, so that the
            // parsers are parsing the response body and not the HTTP
            // headers.
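            // A (W)ARC "response" record stores the full HTTP response,
            // status line and headers included, e.g.:
            //
            //   HTTP/1.1 200 OK
            //   Content-Type: text/html
            //   <blank line>
            //   <html>...</html>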
            record.skipHttpHeader();

            // We use record.available() rather than meta.getLength()
            // because the latter includes the size of the HTTP header,
            // which we just skipped.
            byte[] bytes = readBytes(record, record.available());

            // If there is no digest, then we assume we're reading an
            // ARCRecord not a WARCRecord.  In that case, we close the
            // record, which updates the digest string.  Then we tweak the
            // digest string so we have the same form for both ARC and WARC
            // records.
            if (meta.getDigest() == null) {
                record.close();

                // This is a bit hacky, but ARC and WARC records produce
                // two slightly different digest formats.  WARC record
                // digests have the algorithm name as a prefix, such as
                // "sha1:PD3SS4WWZVFWTDC63RU2MWX7BVC2Y2VA" but
                // ArcRecord.getDigestStr() does not.  Since we want the
                // formats to match, we prepend the "sha1:" prefix to the
                // ARC record digest.
                meta.setDigest("sha1:" + record.getDigestStr());
            }

            // Normalize and filter.
            String url = this.normalizeAndFilterUrl(meta.getUrl(), meta.getDigest(), meta.getDate());
            if (url == null) {
                if (LOG.isInfoEnabled()) {
                    LOG.info("Skip URL: " + meta.getUrl());
                }
                return false;
            }

            // We create a key which combines the URL and digest values.
            // This is necessary because Nutch stores all the data in
            // MapFiles, which are basically just {key,value} pairs.
            //
            // If we use just the URL as the key (which is the way Nutch
            // usually works) then we have problems with multiple,
            // different copies of the same URL.  If we try and store two
            // different copies of the same URL (each having a different
            // digest) and only use the URL as the key, when the MapFile
            // is written, only *one* copy of the page will be stored.
            //
            // Think about it, we're basically doing:
            //   MapFile.put( url, value1 );
            //   MapFile.put( url, value2 );
            // Only one of those url,value mappings will be kept; the other
            // is over-written.
            //
            // So, by using the url+digest as the key, we can have all the
            // data stored.  The only problem is all over in Nutch where
            // the key==url is assumed :(
            String key = url + " " + meta.getDigest();

            Metadata contentMetadata = new Metadata();
            // Set the segment name, just as is done by standard Nutch
            // fetching.  Then, add the NutchWAX-specific metadata fields.
            contentMetadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);

            // We store both the normal URL and the URL+digest key for
            // later retrieval by the indexing plugin(s).
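            // For example, a key produced above might look like (the URL is
            // illustrative; the digest format matches the one described
            // earlier):
            //   http://example.org/index.html sha1:PD3SS4WWZVFWTDC63RU2MWX7BVC2Y2VA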
            contentMetadata.set(NutchWax.URL_KEY, url);
            // contentMetadata.set( NutchWax.ORIG_KEY, key );

            contentMetadata.set(NutchWax.FILENAME_KEY, meta.getArcFile().getName());
            contentMetadata.set(NutchWax.FILEOFFSET_KEY, String.valueOf(record.getHeader().getOffset()));

            contentMetadata.set(NutchWax.COLLECTION_KEY, collectionName);
            contentMetadata.set(NutchWax.DATE_KEY, meta.getDate());
            contentMetadata.set(NutchWax.DIGEST_KEY, meta.getDigest());
            contentMetadata.set(NutchWax.CONTENT_TYPE_KEY, meta.getMimetype());
            contentMetadata.set(NutchWax.CONTENT_LENGTH_KEY, String.valueOf(meta.getLength()));
            contentMetadata.set(NutchWax.HTTP_RESPONSE_KEY, String.valueOf(record.getStatusCode()));

            Content content = new Content(url, url, bytes, meta.getMimetype(), contentMetadata, getConf());

            // -----------------
            // Write to the sequence file.
            byte[] contentInOctets = content.getContent();
            String htmlraw = "";

            // The metadata only contains char encodings.
            // LOG.info("Metadata count: " + contentMetadata.names().length);
            // for (String name : contentMetadata.names()) {
            //     LOG.info("meta " + name + " : " + contentMetadata.get(name));
            // }

            // Try decoding the content with its original char encoding.
            try {
                htmlraw = new String(contentInOctets, contentMetadata.get("OriginalCharEncoding"));
            } catch (Exception e) {
                LOG.warn("could not get content with OriginalCharEncoding");
            }

            // If that fails, fall back to UTF-8.
            if (htmlraw.length() == 0) {
                try {
                    htmlraw = new String(contentInOctets, "UTF-8");
                } catch (UnsupportedEncodingException e) {
                    LOG.error("unable to convert content into string");
                }
            }

            URL url_h = null;
            try {
                url_h = new URL(content.getUrl());
            } catch (MalformedURLException e1) {
                LOG.error("Malformed URL Exception: " + e1.getMessage());
                // Without a parseable URL we cannot build the sequence-file
                // key, so give up on this record.
                return false;
            }

            String protocol = url_h.getProtocol();
            String hostname = url_h.getHost();
            String urlpath = url_h.getPath();
            String param = url_h.getQuery();
            // LOG.info("HOST:" + hostname);
            // LOG.info("PATH:" + urlpath);
            // LOG.info("PROTOCOL:" + protocol);
            // LOG.info("PARAM: " + param);

            String date = meta.getDate();
            // LOG.info("meta date: " + date);

            // The sequence-file key is "protocol::host::path::query::date".
            Text key_h = new Text(protocol + "::" + hostname + "::" + urlpath + "::" + param + "::" + date);
            Text value = new Text(htmlraw);

            try {
                LOG.info("len: " + writer.getLength() + ", key: " + key_h
                        + ", value len: " + value.getLength());
                writer.append(key_h, value);
            } catch (IOException e) {
                LOG.error("SequenceFile IOException: " + e.getMessage());
            }
            // -----------------

            output(output, new Text(key), content);

            return true;
        } catch (Throwable t) {
            LOG.error("Import fail : " + meta.getUrl(), t);
        }
        return false;
    }

    /**
     * Normalize and filter the URL.  If the URL is malformed or filtered
     * (according to registered Nutch URL filtering plugins), return
     * <code>null</code>.  Otherwise return the normalized URL.
     *
     * @param candidateUrl
     *            to be normalized and filtered
     * @param digest
     *            of URL content
     * @param date
     *            of URL capture
     * @return normalized URL, <code>null</code> if malformed or filtered out
     */
    private String normalizeAndFilterUrl(String candidateUrl, String digest, String date) {
        String url = null;
        try {
            url = normalizers.normalize(candidateUrl, URLNormalizers.SCOPE_FETCHER);
            if (urlFilters.filter(url + " " + digest + " " + date) != null) {
                return url;
            }
        } catch (MalformedURLException mue) {
            if (LOG.isInfoEnabled()) {
                LOG.info("MalformedURL: " + candidateUrl);
            }
        } catch (URLFilterException ufe) {
            if (LOG.isInfoEnabled()) {
                LOG.info("URL filtered: " + candidateUrl);
            }
        }
        return null;
    }

    /**
     * Writes the key and related content to the output collector.
     * The division between <code>importRecord</code> and <code>output</code>
     * is merely based on the way the code was structured in
     * <code>ArcSegmentCreator.java</code>, which was used as a
     * starting-point for this class.
     */
    private void output(OutputCollector output, Text key, Content content) {
        LOG.debug("output( " + key + " )");

        // Create the crawl datum.
        CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_FETCH_SUCCESS, this.interval, 1.0f);

        // ?: I have no idea why we need to store the ProtocolStatus in
        // the datum's metadata, but the Nutch Fetcher class does it and
        // it seems important.  Since we're not really fetching here, we
        // assume ProtocolStatus.STATUS_SUCCESS is the right thing to do.
        datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, ProtocolStatus.STATUS_SUCCESS);

        // ?: Since we store the ARCRecord's archival date in the Content
        // object, we follow the logic in Nutch's Fetcher and store the
        // current import time/date in the Datum.  I have no idea if it
        // makes a difference, other than this value is stored in the
        // "tstamp" field in the Lucene index whereas the ARCRecord date is
        // stored in the "date" field we added above.
        datum.setFetchTime(System.currentTimeMillis());

        // ?: It doesn't seem to me that we need/use the scoring stuff
        // one way or another, but we might as well leave it in.
        try {
            scfilters.passScoreBeforeParsing(key, datum, content);
        } catch (Exception e) {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Couldn't pass score before parsing for: " + key, e);
            }
        }

        // ?: This is kind of interesting.  In the Nutch Fetcher class, if
        // the parsing fails, the Content is not added to the output.  But
        // in Importer, we still add it, even if the parsing fails.  Why?
        //
        // One benefit is that even if the parsing fails, having the Content
        // in the index still allows us to find the document by URL, date,
        // etc.
        //
        // However, I don't know what will happen when a summary is
        // computed... if the Content isn't there, will it fail or just
        // return an empty summary?
        ParseResult parseResult = null;
        try {
            parseResult = this.parseUtil.parse(content);
        } catch (Exception e) {
            LOG.warn("Error parsing: " + key, e);
        }

        // ?: This is taken from Nutch Fetcher.  I believe the signatures
        // are used in the Fetcher to ensure that URL contents are not
        // stored multiple times if the signature doesn't change.  Makes
        // sense.  But, in our case, we're relying on the (W)ARC production
        // tools to eliminate duplicate data (or are we?), so how important
        // is the signature for our purposes?  I'll go ahead and leave it
        // in, in case it's needed by Nutch for unknown purposes.
        //
        // Also, since we still import documents even if the parsing fails,
        // we compute a signature using an "empty" Parse object in the case
        // of parse failure.  I don't know why we create an empty Parse
        // object rather than just use 'null', but I'm copying the way the
        // Fetcher does it.
        //
        // One odd thing is that we add the signature to the datum here,
        // then "collect" the datum just below, but then after collecting
        // the datum, we update the signature when processing the
        // ParseResults.  I guess "collecting" doesn't write out the datum,
        // but "collects" it for later output, thus we can update it after
        // collection (I guess).
        if (parseResult == null) {
            byte[] signature = SignatureFactory.getSignature(getConf())
                    .calculate(content, new ParseStatus().getEmptyParse(getConf()));
            datum.setSignature(signature);
        }

        try {
            // Some weird problem with Hadoop 0.19.x - when the crawl_data
            // is merged during the reduce step, the classloader cannot
            // find the org.apache.nutch.protocol.ProtocolStatus class.
            //
            // We avoid the whole issue by omitting the crawl_data
            // altogether, which we don't use anyways.
            //
            // output.collect( key, new NutchWritable( datum ) );

            if (jobConf.getBoolean("nutchwax.import.store.content", false)) {
                output.collect(key, new NutchWritable(content));
            }

            if (parseResult != null) {
                for (Entry<Text, Parse> entry : parseResult) {
                    Text url = entry.getKey();
                    Parse parse = entry.getValue();
                    ParseStatus parseStatus = parse.getData().getStatus();

                    if (!parseStatus.isSuccess()) {
                        LOG.warn("Error parsing: " + key + ": " + parseStatus);
                        parse = parseStatus.getEmptyParse(getConf());
                    }

                    byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parse);

                    // ?: Why bother setting this one again?  According to
                    // the ParseData Javadoc, getContentMeta() returns the
                    // original Content metadata object, so why are we
                    // setting the segment name on it to the same value
                    // again?  Let's leave it out.
                    // parse.getData().getContentMeta().set( Nutch.SEGMENT_NAME_KEY, segmentName );

                    // ?: These two are copied from Nutch's Fetcher
                    // implementation.
                    parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
                    parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY, Long.toString(datum.getFetchTime()));

                    // ?: What is this all about?  It was in the original
                    // ArcSegmentCreator.java that inspired this code.  But
                    // I can't figure out why we need it.  If anything this
                    // will always be false since our key is now URL+digest,
                    // not just URL.  Since it's always false, let's leave
                    // it out.
                    /*
                     * if ( url.equals( key ) ) {
                     *     datum.setSignature( signature );
                     * } else {
                     *     if ( LOG.isWarnEnabled() )
                     *         LOG.warn( "ParseResult entry key and url differ: key=" + key + " url=" + url );
                     * }
                     */

                    // ?: As above, we'll leave the scoring hooks in place.
                    try {
                        scfilters.passScoreAfterParsing(url, content, parse);
                    } catch (Exception e) {
                        if (LOG.isWarnEnabled()) {
                            LOG.warn("Couldn't pass score, url = " + key, e);
                        }
                    }

                    output.collect(key, new NutchWritable(new ParseImpl(
                            new ParseText(parse.getText()), parse.getData(), parse.isCanonical())));
                }
            }
        } catch (Exception e) {
            LOG.error("Error outputting Nutch record for: " + key, e);
        }
    }

    /**
     * Utility method to read the content bytes from an archive record.  The
     * number of bytes read can be limited via the configuration property
     * <code>nutchwax.import.content.limit</code>.
     */
    private byte[] readBytes(ARCRecord record, long contentLength) throws IOException {
        // Ensure the record does strict reading.
        record.setStrict(true);

        long size = jobConf.getLong("nutchwax.import.content.limit", -1);
        if (size < 0) {
            size = contentLength;
        } else {
            size = Math.min(size, contentLength);
        }

        // Read the bytes of the HTTP response.
        byte[] bytes = new byte[(int) size];
        if (size == 0) {
            return bytes;
        }

        // NOTE: Do not use read(byte[]) because ArchiveRecord does NOT
        // over-ride the implementation inherited from InputStream.  And
        // since it does not over-ride it, it won't do the digesting on it.
        // Must use either read(byte[],offset,length) or read().
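        // Fill bytes[] with a read loop: read(byte[],off,len) may return
        // fewer bytes than requested, so keep reading until the buffer is
        // full or EOF is reached (a short record trips the sanity check
        // below).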
        int pos = 0;
        int n;
        while (pos < bytes.length
                && (n = record.read(bytes, pos, bytes.length - pos)) >= 0) {
            pos += n;
        }

        // Now that the bytes[] buffer has been filled, read the remainder
        // of the record so that the digest is computed over the entire
        // content.
        byte[] buf = new byte[1024 * 1024];
        int count = 0;
        while (record.available() > 0) {
            count += record.read(buf, 0, Math.min(buf.length, record.available()));
        }

        if (LOG.isInfoEnabled()) {
            LOG.info("Bytes read: expected=" + contentLength + " bytes.length="
                    + bytes.length + " pos=" + pos + " count=" + count);
        }

        // Sanity check.  The number of bytes read into our bytes[] buffer,
        // plus the count of extra stuff read after it, should equal the
        // contentLength passed into this function.
        if (pos + count != contentLength) {
            throw new IOException("Incorrect number of bytes read from ArchiveRecord: expected="
                    + contentLength + " bytes.length=" + bytes.length
                    + " pos=" + pos + " count=" + count);
        }

        return bytes;
    }

    /**
     * Runs the import job with the given arguments.  This method assumes
     * that it is being run via the command-line; as such, it emits error
     * messages regarding invalid/missing arguments to the system error
     * stream.
     */
    public int run(String[] args) throws Exception {
        if (args.length < 1) {
            usage();
            return -1;
        }

        JobConf job = new NutchJob(getConf());
        System.setProperty("fullPathExecution", "false");

        Path manifestPath = null;

        // Check for the "-e <exclusions>" and "-p <path_to_warc_files>"
        // options.
        int pos = 0;
        for (String[] str : getOptsList(args)) {
            if (args.length < pos + 2) {
                System.out.println("ERROR: Missing filename for option \"" + str[0] + "\"\n");
                usage();
                return -1;
            }
            if (str[0].equals("-p")) {
                manifestPath = new Path(getManifestFile(str[1]));
                System.setProperty("fullPathExecution", "true");
            } else if (str[0].equals("-e")) {
                job.set("nutchwax.urlfilter.wayback.exclusions", str[1]);
            }
            pos = pos + 2;
        }

        if (manifestPath == null) {
            if (args.length - pos < 1) {
                System.out.println("ERROR: Missing manifest file.\n");
                usage();
                return -1;
            } else {
                manifestPath = new Path(args[pos++]);
            }
        }

        Path segmentPath;
        if (args.length - pos < 1) {
            segmentPath = new Path("segments", org.apache.nutch.crawl.Generator.generateSegmentName());
        } else {
            segmentPath = new Path(args[pos]);
        }

        try {
            job.setJobName("Importer_to_Hdfs " + manifestPath);
            job.set(Nutch.SEGMENT_NAME_KEY, segmentPath.getName());

            // job.setInputPath( manifestPath );
            FileInputFormat.addInputPath(job, manifestPath);
            job.setInputFormat(TextInputFormat.class);

            job.setMapperClass(ImporterToHdfs.class);

            // job.setOutputPath( segmentPath );
            FileOutputFormat.setOutputPath(job, segmentPath);
            job.setOutputFormat(FetcherOutputFormat.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(NutchWritable.class);

            RunningJob rj = JobClient.runJob(job);
            return rj.isSuccessful() ? 0 : 1;
        } catch (Exception e) {
            LOG.fatal("Importer_to_Hdfs: ", e);
            System.out.println("Fatal error: " + e);
            e.printStackTrace(System.out);
            return -1;
        }
    }

    /** Collects the recognized "-p"/"-e" options and their arguments. */
    private List<String[]> getOptsList(String[] args) {
        List<String[]> list = new ArrayList<String[]>();
        if (hasArgument(args[0])) {
            list.add(new String[] { args[0], args.length >= 2 ? args[1] : "" });
        }
        if (args.length >= 3 && hasArgument(args[2])) {
            list.add(new String[] { args[2], args.length >= 4 ? args[3] : "" });
        }
        return list;
    }

    private boolean hasArgument(String val) {
        return val.equals("-p") || val.equals("-e");
    }

    /**
     * Builds a manifest file listing the not-yet-processed .warc files found
     * under <code>path</code> and returns the manifest's absolute path.
     */
    private String getManifestFile(String path) throws IOException {
        File processedFiles = getProcessedFile(path);
        File manifestFile = getManifestFileLocation(path);
        List<String> files = getFiles(path, processedFiles);

        BufferedWriter output = null;
        try {
            output = new BufferedWriter(new FileWriter(manifestFile));
            for (String f : files) {
                output.write(f);
                output.write("\n");
            }
        } catch (Exception e) {
            System.out.println("Fatal error: " + e);
            e.printStackTrace(System.out);
        } finally {
            if (output != null) {
                output.close();
            }
        }
        return manifestFile.getAbsolutePath();
    }

    /** Returns the .warc files in the directory that were not yet processed. */
    private List<String> getFiles(String dirPath, File processedFiles) {
        List<String> processed = getProcessedFiles(processedFiles.getAbsolutePath());
        List<String> list = new ArrayList<String>();
        for (String file : getFilesList(dirPath)) {
            if (file.endsWith(".warc") && !processed.contains(file)) {
                list.add(file);
            }
        }
        return list;
    }

    private List<String> getFilesList(String dirPath) {
        File path = new File(dirPath);
        List<String> listFiles = new ArrayList<String>();
        for (File file : path.listFiles()) {
            listFiles.add(file.getAbsolutePath());
        }
        return listFiles;
    }

    private List<String> getProcessedFiles(String path) {
        List<String> processed = new ArrayList<String>();
        try {
            java.nio.file.Path file = Paths.get(path);
            processed = Files.readAllLines(file, StandardCharsets.UTF_8);
        } catch (IOException e) {
            System.out.println("Fatal error: " + e);
            e.printStackTrace(System.out);
        }
        return processed;
    }

    private File getManifestFileLocation(String path) {
        if (getManifestPath() != null && !getManifestPath().isEmpty()) {
            path = getManifestPath();
        }
        if (!path.endsWith("/")) {
            path = path + "/";
        }
        path = path + "manifest.txt";
        return new File(path);
    }

    private void add2ProcessedFile(String warcFile) {
        add2File(warcFile, getProcessedFile(warcFile.substring(0, warcFile.lastIndexOf("/"))));
    }

    private void add2ProcessedFailedFile(String warcFile) {
        add2File(warcFile, getProcessedFailedFile(warcFile.substring(0, warcFile.lastIndexOf("/"))));
    }

    /** Appends the given warc file name to the bookkeeping file. */
    private void add2File(String warcFile, File path) {
        BufferedWriter output = null;
        try {
            output = new BufferedWriter(new FileWriter(path, true));
            output.write(warcFile);
            output.write("\n");
        } catch (IOException e) {
            System.out.println("Fatal error: " + e);
            e.printStackTrace(System.out);
        } finally {
            if (output != null) {
                try {
                    output.close();
                } catch (IOException e) {
                    System.out.println("Fatal error: " + e);
                    e.printStackTrace(System.out);
                }
            }
        }
    }

    private File getProcessedFile(String path) {
        return getFile(path, ".processed");
    }

    private File getProcessedFailedFile(String path) {
        return getFile(path, ".processingfailed");
    }

    private File getFile(String path, String filename) {
        if (getProcessedPath() != null && !getProcessedPath().isEmpty()) {
            path = getProcessedPath();
        }
        if (!path.endsWith("/")) {
            path = path + "/";
        }
        path = path + filename;
        File file = new File(path);
        if (!file.exists()) {
            try {
                file.createNewFile();
            } catch (IOException e) {
                System.out.println("Fatal error: " + e);
                e.printStackTrace(System.out);
            }
        }
        return file;
    }

    private String getManifestPath() {
        return getConf().get("nutchwax.importer.hdfs.manifestPath");
    }

    private String getProcessedPath() {
        return getConf().get("nutchwax.importer.hdfs.processedPath");
    }

    /**
     * Emit usage information for the command-line driver.
     */
    public void usage() {
        String usage = "Usage: import_to_hdfs [opts] [<manifest>] [<segment>]\n"
                + "Options:\n\n"
                + "  -p path     warc files location; generates the manifest file automatically.\n\n"
                + "The created manifest file location is specified in the configuration.\n"
                + "The manifest file will be generated in the <path> directory if the\n"
                + "configuration does not specify a manifest file location.\n"
                + "Argument <manifest> can't be used if <path> is specified.\n"
                + "If <path> is not given, argument <manifest> has to be specified.\n"
                + "\n"
                + "  -e filename Exclusions file, over-rides configuration property.\n"
                + "\n"
                + "If <segment> is not specified, a pathname will be automatically generated\n"
                + "based on the current time in sub-directory 'segments', which is created\n"
                + "if necessary.  This is to mirror the behavior of other Nutch actions.\n";

        System.out.println(usage);
    }

    /**
     * Command-line driver.  Runs the Importer as a Hadoop job.
     */
    public static void main(String args[]) throws Exception {
        int result = ToolRunner.run(NutchConfiguration.create(), new ImporterToHdfs(), args);
        System.exit(result);
    }
}

/**
 * This should all be moved into some sort of filtering plugin.  Unfortunately
 * the URLFilter plugin interface isn't adequate as it only looks at a URL
 * string.  Rather than jamming a response code through that interface, we do
 * a one-off filter class here.
 *
 * A long-term solution would be to create a new Nutch extension point
 * interface that takes an ARCRecord rather than a URL string.  That way we
 * can write filters that can operate on any part of an ARCRecord, not just
 * the URL.
 */
class HTTPStatusCodeFilter {
    List<Range> ranges = new ArrayList<Range>();

    // The configuration value is a whitespace-separated list of single
    // status codes, ranges, and the token "unknown", e.g. "200 300-399 unknown".
    public HTTPStatusCodeFilter(String configuration) {
        if (configuration == null) {
            return;
        }

        configuration = configuration.trim();

        for (String value : configuration.split("\\s+")) {
            Range range = new Range();

            // Special handling for "unknown" where an ARCRecord doesn't
            // have an HTTP status code.  ARCRecord.getStatusCode() returns
            // -1 in that case, so we make a range for it.
            if (value.toLowerCase().equals("unknown")) {
                range.lower = -1;
                range.upper = -1;
                this.ranges.add(range);
                continue;
            }

            String values[] = value.split("[-]");
            try {
                switch (values.length) {
                case 2:
                    // It's a range, N-M.
                    range.lower = Integer.parseInt(values[0]);
                    range.upper = Integer.parseInt(values[1]);
                    break;
                case 1:
                    // It's a single value; convert to a single-value range.
                    range.lower = Integer.parseInt(values[0]);
                    range.upper = range.lower;
                    break;
                default:
                    // Bad format.
                    ImporterToHdfs.LOG.warn("Illegal format for nutchwax.filter.http.status: " + value);
                    continue;
                }
                this.ranges.add(range);
            } catch (NumberFormatException nfe) {
                ImporterToHdfs.LOG.warn("Illegal format for nutchwax.filter.http.status: " + value, nfe);
            }
        }
    }

    public boolean isAllowed(int code) {
        // Check every configured range, not just the first one.
        for (Range r : this.ranges) {
            if (r.lower <= code && code <= r.upper) {
                return true;
            }
        }
        return false;
    }

    static class Range {
        int lower;
        int upper;
    }
}
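// ---------------------------------------------------------------------------
// Example invocations (a sketch; the paths below are hypothetical).  As the
// class Javadoc recommends, start the importer through the Nutch launcher
// script rather than invoking the JVM directly, e.g.:
//
//   bin/nutch org.archive.nutchwax.ImporterToHdfs manifest.txt segments/20100101000000
//
// or let the importer build the manifest itself from a directory of .warc
// files, writing to an auto-generated segment:
//
//   bin/nutch org.archive.nutchwax.ImporterToHdfs -p /data/warcs
// ---------------------------------------------------------------------------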