Java tutorial
/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.crawl; import java.io.DataOutputStream; import java.io.File; import java.io.IOException; import java.io.Closeable; import java.lang.invoke.MethodHandles; import java.net.URL; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; import java.util.Date; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Random; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.TreeMap; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.tdunning.math.stats.MergingDigest; import com.tdunning.math.stats.TDigest; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.FloatWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.MapFile; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner; import org.apache.hadoop.mapreduce.Mapper.Context; import org.apache.hadoop.mapreduce.RecordWriter; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.hadoop.util.StringUtils; import org.apache.nutch.util.AbstractChecker; import org.apache.nutch.util.JexlUtil; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.SegmentReaderUtil; import org.apache.nutch.util.StringUtil; import org.apache.nutch.util.TimingUtil; import org.apache.commons.jexl2.Expression; /** * Read utility for the CrawlDB. * * @author Andrzej Bialecki * */ public class CrawlDbReader extends AbstractChecker implements Closeable { private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); private MapFile.Reader[] readers = null; protected String crawlDb; private void openReaders(String crawlDb, Configuration config) throws IOException { if (readers != null) return; Path crawlDbPath = new Path(crawlDb, CrawlDb.CURRENT_NAME); readers = MapFileOutputFormat.getReaders(crawlDbPath, config); } private void closeReaders() { if (readers == null) return; for (int i = 0; i < readers.length; i++) { try { readers[i].close(); } catch (Exception e) { } } readers = null; } public static class CrawlDatumCsvOutputFormat extends FileOutputFormat<Text, CrawlDatum> { protected static class LineRecordWriter extends RecordWriter<Text, CrawlDatum> { private DataOutputStream out; public LineRecordWriter(DataOutputStream out) { this.out = out; try { out.writeBytes( "Url,Status code,Status name,Fetch Time,Modified Time,Retries since fetch,Retry interval seconds,Retry interval days,Score,Signature,Metadata\n"); } catch (IOException e) { } } public synchronized void write(Text key, CrawlDatum value) throws IOException { out.writeByte('"'); out.writeBytes(key.toString()); out.writeByte('"'); out.writeByte(','); out.writeBytes(Integer.toString(value.getStatus())); out.writeByte(','); out.writeByte('"'); out.writeBytes(CrawlDatum.getStatusName(value.getStatus())); out.writeByte('"'); out.writeByte(','); out.writeBytes(new Date(value.getFetchTime()).toString()); out.writeByte(','); out.writeBytes(new Date(value.getModifiedTime()).toString()); out.writeByte(','); out.writeBytes(Integer.toString(value.getRetriesSinceFetch())); out.writeByte(','); out.writeBytes(Float.toString(value.getFetchInterval())); out.writeByte(','); out.writeBytes(Float.toString((value.getFetchInterval() / FetchSchedule.SECONDS_PER_DAY))); out.writeByte(','); out.writeBytes(Float.toString(value.getScore())); out.writeByte(','); out.writeByte('"'); out.writeBytes( value.getSignature() != null ? StringUtil.toHexString(value.getSignature()) : "null"); out.writeByte('"'); out.writeByte(','); out.writeByte('"'); if (value.getMetaData() != null) { for (Entry<Writable, Writable> e : value.getMetaData().entrySet()) { out.writeBytes(e.getKey().toString()); out.writeByte(':'); out.writeBytes(e.getValue().toString()); out.writeBytes("|||"); } } out.writeByte('"'); out.writeByte('\n'); } public synchronized void close(TaskAttemptContext context) throws IOException { out.close(); } } public RecordWriter<Text, CrawlDatum> getRecordWriter(TaskAttemptContext context) throws IOException { String name = getUniqueFile(context, "part", ""); Path dir = FileOutputFormat.getOutputPath(context); FileSystem fs = dir.getFileSystem(context.getConfiguration()); DataOutputStream fileOut = fs.create(new Path(dir, name), context); return new LineRecordWriter(fileOut); } } public static class CrawlDbStatMapper extends Mapper<Text, CrawlDatum, Text, NutchWritable> { NutchWritable COUNT_1 = new NutchWritable(new LongWritable(1)); private boolean sort = false; @Override public void setup(Mapper<Text, CrawlDatum, Text, NutchWritable>.Context context) { Configuration conf = context.getConfiguration(); sort = conf.getBoolean("db.reader.stats.sort", false); } @Override public void map(Text key, CrawlDatum value, Context context) throws IOException, InterruptedException { context.write(new Text("T"), COUNT_1); context.write(new Text("status " + value.getStatus()), COUNT_1); context.write(new Text("retry " + value.getRetriesSinceFetch()), COUNT_1); if (Float.isNaN(value.getScore())) { context.write(new Text("scNaN"), COUNT_1); } else { NutchWritable score = new NutchWritable(new FloatWritable(value.getScore())); context.write(new Text("sc"), score); context.write(new Text("sct"), score); context.write(new Text("scd"), score); } // fetch time (in minutes to prevent from overflows when summing up) NutchWritable fetchTime = new NutchWritable(new LongWritable(value.getFetchTime() / (1000 * 60))); context.write(new Text("ft"), fetchTime); context.write(new Text("ftt"), fetchTime); // fetch interval (in seconds) NutchWritable fetchInterval = new NutchWritable(new LongWritable(value.getFetchInterval())); context.write(new Text("fi"), fetchInterval); context.write(new Text("fit"), fetchInterval); if (sort) { URL u = new URL(key.toString()); String host = u.getHost(); context.write(new Text("status " + value.getStatus() + " " + host), COUNT_1); } } } public static class CrawlDbStatReducer extends Reducer<Text, NutchWritable, Text, NutchWritable> { public void setup(Reducer<Text, NutchWritable, Text, NutchWritable>.Context context) { } @Override public void reduce(Text key, Iterable<NutchWritable> values, Context context) throws IOException, InterruptedException { String k = key.toString(); if (k.equals("T") || k.startsWith("status") || k.startsWith("retry") || k.equals("ftt") || k.equals("fit")) { // sum all values for this key long sum = 0; for (NutchWritable value : values) { sum += ((LongWritable) value.get()).get(); } // output sum context.write(key, new NutchWritable(new LongWritable(sum))); } else if (k.equals("sc")) { float min = Float.MAX_VALUE; float max = Float.MIN_VALUE; for (NutchWritable nvalue : values) { float value = ((FloatWritable) nvalue.get()).get(); if (max < value) { max = value; } if (min > value) { min = value; } } context.write(key, new NutchWritable(new FloatWritable(min))); context.write(key, new NutchWritable(new FloatWritable(max))); } else if (k.equals("ft") || k.equals("fi")) { long min = Long.MAX_VALUE; long max = Long.MIN_VALUE; for (NutchWritable nvalue : values) { long value = ((LongWritable) nvalue.get()).get(); if (max < value) { max = value; } if (min > value) { min = value; } } context.write(key, new NutchWritable(new LongWritable(min))); context.write(key, new NutchWritable(new LongWritable(max))); } else if (k.equals("sct")) { float cnt = 0.0f; for (NutchWritable nvalue : values) { float value = ((FloatWritable) nvalue.get()).get(); cnt += value; } context.write(key, new NutchWritable(new FloatWritable(cnt))); } else if (k.equals("scd")) { MergingDigest tdigest = null; for (NutchWritable nvalue : values) { Writable value = nvalue.get(); if (value instanceof BytesWritable) { byte[] bytes = ((BytesWritable) value).getBytes(); MergingDigest tdig = MergingDigest.fromBytes(ByteBuffer.wrap(bytes)); if (tdigest == null) { tdigest = tdig; } else { tdigest.add(tdig); } } else if (value instanceof FloatWritable) { float val = ((FloatWritable) value).get(); if (!Float.isNaN(val)) { if (tdigest == null) { tdigest = (MergingDigest) TDigest.createMergingDigest(100.0); } tdigest.add(val); } } } ByteBuffer tdigestBytes = ByteBuffer.allocate(tdigest.smallByteSize()); tdigest.asSmallBytes(tdigestBytes); context.write(key, new NutchWritable(new BytesWritable(tdigestBytes.array()))); } } } public static class CrawlDbTopNMapper extends Mapper<Text, CrawlDatum, FloatWritable, Text> { private static final FloatWritable fw = new FloatWritable(); private float min = 0.0f; @Override public void setup(Mapper<Text, CrawlDatum, FloatWritable, Text>.Context context) { Configuration conf = context.getConfiguration(); min = conf.getFloat("db.reader.topn.min", 0.0f); } @Override public void map(Text key, CrawlDatum value, Context context) throws IOException, InterruptedException { if (value.getScore() < min) return; // don't collect low-scoring records fw.set(-value.getScore()); // reverse sorting order context.write(fw, key); // invert mapping: score -> url } } public static class CrawlDbTopNReducer extends Reducer<FloatWritable, Text, FloatWritable, Text> { private long topN; private long count = 0L; @Override public void reduce(FloatWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException { for (Text value : values) { if (count < topN) { key.set(-key.get()); context.write(key, value); count++; } } } @Override public void setup(Reducer<FloatWritable, Text, FloatWritable, Text>.Context context) { Configuration conf = context.getConfiguration(); topN = conf.getLong("db.reader.topn", 100) / Integer.parseInt(conf.get("mapreduce.job.reduces")); } } public void close() { closeReaders(); } private TreeMap<String, Writable> processStatJobHelper(String crawlDb, Configuration config, boolean sort) throws IOException, InterruptedException, ClassNotFoundException { Path tmpFolder = new Path(crawlDb, "stat_tmp" + System.currentTimeMillis()); Job job = NutchJob.getInstance(config); config = job.getConfiguration(); job.setJobName("stats " + crawlDb); config.setBoolean("db.reader.stats.sort", sort); FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME)); job.setInputFormatClass(SequenceFileInputFormat.class); job.setJarByClass(CrawlDbReader.class); job.setMapperClass(CrawlDbStatMapper.class); job.setCombinerClass(CrawlDbStatReducer.class); job.setReducerClass(CrawlDbStatReducer.class); FileOutputFormat.setOutputPath(job, tmpFolder); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(NutchWritable.class); // https://issues.apache.org/jira/browse/NUTCH-1029 config.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false); FileSystem fileSystem = tmpFolder.getFileSystem(config); try { boolean success = job.waitForCompletion(true); if (!success) { String message = "CrawlDbReader job did not succeed, job status:" + job.getStatus().getState() + ", reason: " + job.getStatus().getFailureInfo(); LOG.error(message); fileSystem.delete(tmpFolder, true); throw new RuntimeException(message); } } catch (IOException | InterruptedException | ClassNotFoundException e) { LOG.error(StringUtils.stringifyException(e)); fileSystem.delete(tmpFolder, true); throw e; } // reading the result SequenceFile.Reader[] readers = SegmentReaderUtil.getReaders(tmpFolder, config); Text key = new Text(); NutchWritable value = new NutchWritable(); TreeMap<String, Writable> stats = new TreeMap<>(); for (int i = 0; i < readers.length; i++) { SequenceFile.Reader reader = readers[i]; while (reader.next(key, value)) { String k = key.toString(); Writable val = stats.get(k); if (val == null) { stats.put(k, value.get()); continue; } if (k.equals("sc")) { float min = Float.MAX_VALUE; float max = Float.MIN_VALUE; if (stats.containsKey("scn")) { min = ((FloatWritable) stats.get("scn")).get(); } else { min = ((FloatWritable) stats.get("sc")).get(); } if (stats.containsKey("scx")) { max = ((FloatWritable) stats.get("scx")).get(); } else { max = ((FloatWritable) stats.get("sc")).get(); } float fvalue = ((FloatWritable) value.get()).get(); if (min > fvalue) { min = fvalue; } if (max < fvalue) { max = fvalue; } stats.put("scn", new FloatWritable(min)); stats.put("scx", new FloatWritable(max)); } else if (k.equals("ft") || k.equals("fi")) { long min = Long.MAX_VALUE; long max = Long.MIN_VALUE; String minKey = k + "n"; String maxKey = k + "x"; if (stats.containsKey(minKey)) { min = ((LongWritable) stats.get(minKey)).get(); } else if (stats.containsKey(k)) { min = ((LongWritable) stats.get(k)).get(); } if (stats.containsKey(maxKey)) { max = ((LongWritable) stats.get(maxKey)).get(); } else if (stats.containsKey(k)) { max = ((LongWritable) stats.get(k)).get(); } long lvalue = ((LongWritable) value.get()).get(); if (min > lvalue) { min = lvalue; } if (max < lvalue) { max = lvalue; } stats.put(k + "n", new LongWritable(min)); stats.put(k + "x", new LongWritable(max)); } else if (k.equals("sct")) { FloatWritable fvalue = (FloatWritable) value.get(); ((FloatWritable) val).set(((FloatWritable) val).get() + fvalue.get()); } else if (k.equals("scd")) { MergingDigest tdigest = null; MergingDigest tdig = MergingDigest .fromBytes(ByteBuffer.wrap(((BytesWritable) value.get()).getBytes())); if (val instanceof BytesWritable) { tdigest = MergingDigest.fromBytes(ByteBuffer.wrap(((BytesWritable) val).getBytes())); tdigest.add(tdig); } else { tdigest = tdig; } ByteBuffer tdigestBytes = ByteBuffer.allocate(tdigest.smallByteSize()); tdigest.asSmallBytes(tdigestBytes); stats.put(k, new BytesWritable(tdigestBytes.array())); } else { LongWritable lvalue = (LongWritable) value.get(); ((LongWritable) val).set(((LongWritable) val).get() + lvalue.get()); } } reader.close(); } // remove score, fetch interval, and fetch time // (used for min/max calculation) stats.remove("sc"); stats.remove("fi"); stats.remove("ft"); // removing the tmp folder fileSystem.delete(tmpFolder, true); return stats; } public void processStatJob(String crawlDb, Configuration config, boolean sort) throws IOException, InterruptedException, ClassNotFoundException { double quantiles[] = { .01, .05, .1, .2, .25, .3, .4, .5, .6, .7, .75, .8, .9, .95, .99 }; if (config.get("db.stats.score.quantiles") != null) { List<Double> qs = new ArrayList<>(); for (String s : config.getStrings("db.stats.score.quantiles")) { try { double d = Double.parseDouble(s); if (d >= 0.0 && d <= 1.0) { qs.add(d); } else { LOG.warn("Skipping quantile {} not in range in db.stats.score.quantiles: {}", s); } } catch (NumberFormatException e) { LOG.warn("Skipping bad floating point number {} in db.stats.score.quantiles: {}", s, e.getMessage()); } quantiles = new double[qs.size()]; int i = 0; for (Double q : qs) { quantiles[i++] = q; } Arrays.sort(quantiles); } } if (LOG.isInfoEnabled()) { LOG.info("CrawlDb statistics start: " + crawlDb); } TreeMap<String, Writable> stats = processStatJobHelper(crawlDb, config, sort); if (LOG.isInfoEnabled()) { LOG.info("Statistics for CrawlDb: " + crawlDb); LongWritable totalCnt = new LongWritable(0); if (stats.containsKey("T")) { totalCnt = ((LongWritable) stats.get("T")); stats.remove("T"); } LOG.info("TOTAL urls:\t" + totalCnt.get()); for (Map.Entry<String, Writable> entry : stats.entrySet()) { String k = entry.getKey(); long value = 0; double fvalue = 0.0; byte[] bytesValue = null; Writable val = entry.getValue(); if (val instanceof LongWritable) { value = ((LongWritable) val).get(); } else if (val instanceof FloatWritable) { fvalue = ((FloatWritable) val).get(); } else if (val instanceof BytesWritable) { bytesValue = ((BytesWritable) val).getBytes(); } if (k.equals("scn")) { LOG.info("min score:\t" + fvalue); } else if (k.equals("scx")) { LOG.info("max score:\t" + fvalue); } else if (k.equals("sct")) { LOG.info("avg score:\t" + (fvalue / totalCnt.get())); } else if (k.equals("scNaN")) { LOG.info("score == NaN:\t" + value); } else if (k.equals("ftn")) { LOG.info("earliest fetch time:\t" + new Date(1000 * 60 * value)); } else if (k.equals("ftx")) { LOG.info("latest fetch time:\t" + new Date(1000 * 60 * value)); } else if (k.equals("ftt")) { LOG.info("avg of fetch times:\t" + new Date(1000 * 60 * (value / totalCnt.get()))); } else if (k.equals("fin")) { LOG.info("shortest fetch interval:\t{}", TimingUtil.secondsToDaysHMS(value)); } else if (k.equals("fix")) { LOG.info("longest fetch interval:\t{}", TimingUtil.secondsToDaysHMS(value)); } else if (k.equals("fit")) { LOG.info("avg fetch interval:\t{}", TimingUtil.secondsToDaysHMS(value / totalCnt.get())); } else if (k.startsWith("status")) { String[] st = k.split(" "); int code = Integer.parseInt(st[1]); if (st.length > 2) LOG.info(" " + st[2] + " :\t" + val); else LOG.info(st[0] + " " + code + " (" + CrawlDatum.getStatusName((byte) code) + "):\t" + val); } else if (k.equals("scd")) { MergingDigest tdigest = MergingDigest.fromBytes(ByteBuffer.wrap(bytesValue)); for (double q : quantiles) { LOG.info("score quantile {}:\t{}", q, tdigest.quantile(q)); } } else { LOG.info(k + ":\t" + val); } } } if (LOG.isInfoEnabled()) { LOG.info("CrawlDb statistics: done"); } } public CrawlDatum get(String crawlDb, String url, Configuration config) throws IOException { Text key = new Text(url); CrawlDatum val = new CrawlDatum(); openReaders(crawlDb, config); CrawlDatum res = (CrawlDatum) MapFileOutputFormat.getEntry(readers, new HashPartitioner<>(), key, val); return res; } @Override protected int process(String line, StringBuilder output) throws Exception { Job job = NutchJob.getInstance(getConf()); Configuration config = job.getConfiguration(); // Close readers, so we know we're not working on stale data closeReaders(); readUrl(this.crawlDb, line, config, output); return 0; } public void readUrl(String crawlDb, String url, Configuration config, StringBuilder output) throws IOException { CrawlDatum res = get(crawlDb, url, config); output.append("URL: " + url + "\n"); if (res != null) { output.append(res); } else { output.append("not found"); } output.append("\n"); } public void processDumpJob(String crawlDb, String output, Configuration config, String format, String regex, String status, Integer retry, String expr, Float sample) throws IOException, ClassNotFoundException, InterruptedException { if (LOG.isInfoEnabled()) { LOG.info("CrawlDb dump: starting"); LOG.info("CrawlDb db: " + crawlDb); } Path outFolder = new Path(output); Job job = NutchJob.getInstance(config); job.setJobName("dump " + crawlDb); FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME)); job.setInputFormatClass(SequenceFileInputFormat.class); FileOutputFormat.setOutputPath(job, outFolder); if (format.equals("csv")) { job.setOutputFormatClass(CrawlDatumCsvOutputFormat.class); } else if (format.equals("crawldb")) { job.setOutputFormatClass(MapFileOutputFormat.class); } else { job.setOutputFormatClass(TextOutputFormat.class); } if (status != null) config.set("status", status); if (regex != null) config.set("regex", regex); if (retry != null) config.setInt("retry", retry); if (expr != null) { config.set("expr", expr); LOG.info("CrawlDb db: expr: " + expr); } if (sample != null) config.setFloat("sample", sample); job.setMapperClass(CrawlDbDumpMapper.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(CrawlDatum.class); job.setJarByClass(CrawlDbReader.class); try { boolean success = job.waitForCompletion(true); if (!success) { String message = "CrawlDbReader job did not succeed, job status:" + job.getStatus().getState() + ", reason: " + job.getStatus().getFailureInfo(); LOG.error(message); throw new RuntimeException(message); } } catch (IOException | InterruptedException | ClassNotFoundException e) { LOG.error(StringUtils.stringifyException(e)); throw e; } if (LOG.isInfoEnabled()) { LOG.info("CrawlDb dump: done"); } } public static class CrawlDbDumpMapper extends Mapper<Text, CrawlDatum, Text, CrawlDatum> { Pattern pattern = null; Matcher matcher = null; String status = null; Integer retry = null; Expression expr = null; float sample; @Override public void setup(Mapper<Text, CrawlDatum, Text, CrawlDatum>.Context context) { Configuration config = context.getConfiguration(); if (config.get("regex", null) != null) { pattern = Pattern.compile(config.get("regex")); } status = config.get("status", null); retry = config.getInt("retry", -1); if (config.get("expr", null) != null) { expr = JexlUtil.parseExpression(config.get("expr", null)); } sample = config.getFloat("sample", 1); } @Override public void map(Text key, CrawlDatum value, Context context) throws IOException, InterruptedException { // check sample if (sample < 1 && Math.random() > sample) { return; } // check retry if (retry != -1) { if (value.getRetriesSinceFetch() < retry) { return; } } // check status if (status != null && !status.equalsIgnoreCase(CrawlDatum.getStatusName(value.getStatus()))) return; // check regex if (pattern != null) { matcher = pattern.matcher(key.toString()); if (!matcher.matches()) { return; } } // check expr if (expr != null) { if (!value.evaluate(expr, key.toString())) { return; } } context.write(key, value); } } public void processTopNJob(String crawlDb, long topN, float min, String output, Configuration config) throws IOException, ClassNotFoundException, InterruptedException { if (LOG.isInfoEnabled()) { LOG.info("CrawlDb topN: starting (topN=" + topN + ", min=" + min + ")"); LOG.info("CrawlDb db: " + crawlDb); } Path outFolder = new Path(output); Path tempDir = new Path(config.get("mapreduce.cluster.temp.dir", ".") + "/readdb-topN-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); Job job = NutchJob.getInstance(config); job.setJobName("topN prepare " + crawlDb); FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME)); job.setInputFormatClass(SequenceFileInputFormat.class); job.setJarByClass(CrawlDbReader.class); job.setMapperClass(CrawlDbTopNMapper.class); job.setReducerClass(Reducer.class); FileOutputFormat.setOutputPath(job, tempDir); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(FloatWritable.class); job.setOutputValueClass(Text.class); job.getConfiguration().setFloat("db.reader.topn.min", min); FileSystem fs = tempDir.getFileSystem(config); try { boolean success = job.waitForCompletion(true); if (!success) { String message = "CrawlDbReader job did not succeed, job status:" + job.getStatus().getState() + ", reason: " + job.getStatus().getFailureInfo(); LOG.error(message); fs.delete(tempDir, true); throw new RuntimeException(message); } } catch (IOException | InterruptedException | ClassNotFoundException e) { LOG.error(StringUtils.stringifyException(e)); fs.delete(tempDir, true); throw e; } if (LOG.isInfoEnabled()) { LOG.info("CrawlDb topN: collecting topN scores."); } job = NutchJob.getInstance(config); job.setJobName("topN collect " + crawlDb); job.getConfiguration().setLong("db.reader.topn", topN); FileInputFormat.addInputPath(job, tempDir); job.setInputFormatClass(SequenceFileInputFormat.class); job.setMapperClass(Mapper.class); job.setReducerClass(CrawlDbTopNReducer.class); job.setJarByClass(CrawlDbReader.class); FileOutputFormat.setOutputPath(job, outFolder); job.setOutputFormatClass(TextOutputFormat.class); job.setOutputKeyClass(FloatWritable.class); job.setOutputValueClass(Text.class); job.setNumReduceTasks(1); // create a single file. try { boolean success = job.waitForCompletion(true); if (!success) { String message = "CrawlDbReader job did not succeed, job status:" + job.getStatus().getState() + ", reason: " + job.getStatus().getFailureInfo(); LOG.error(message); fs.delete(tempDir, true); throw new RuntimeException(message); } } catch (IOException | InterruptedException | ClassNotFoundException e) { LOG.error(StringUtils.stringifyException(e)); fs.delete(tempDir, true); throw e; } fs.delete(tempDir, true); if (LOG.isInfoEnabled()) { LOG.info("CrawlDb topN: done"); } } public int run(String[] args) throws IOException, InterruptedException, ClassNotFoundException, Exception { @SuppressWarnings("resource") CrawlDbReader dbr = new CrawlDbReader(); if (args.length < 2) { System.err.println( "Usage: CrawlDbReader <crawldb> (-stats | -dump <out_dir> | -topN <nnnn> <out_dir> [<min>] | -url <url>)"); System.err.println("\t<crawldb>\tdirectory name where crawldb is located"); System.err.println("\t-stats [-sort] \tprint overall statistics to System.out"); System.err.println("\t\t[-sort]\tlist status sorted by host"); System.err.println( "\t-dump <out_dir> [-format normal|csv|crawldb]\tdump the whole db to a text file in <out_dir>"); System.err.println("\t\t[-format csv]\tdump in Csv format"); System.err.println("\t\t[-format normal]\tdump in standard format (default option)"); System.err.println("\t\t[-format crawldb]\tdump as CrawlDB"); System.err.println("\t\t[-regex <expr>]\tfilter records with expression"); System.err.println("\t\t[-retry <num>]\tminimum retry count"); System.err.println("\t\t[-status <status>]\tfilter records by CrawlDatum status"); System.err.println("\t\t[-expr <expr>]\tJexl expression to evaluate for this record"); System.err.println("\t\t[-sample <fraction>]\tOnly process a random sample with this ratio"); System.err.println("\t-url <url>\tprint information on <url> to System.out"); System.err .println("\t-topN <nnnn> <out_dir> [<min>]\tdump top <nnnn> urls sorted by score to <out_dir>"); System.err.println("\t\t[<min>]\tskip records with scores below this value."); System.err.println("\t\t\tThis can significantly improve performance."); return -1; } String param = null; String crawlDb = args[0]; this.crawlDb = crawlDb; int numConsumed = 0; Job job = NutchJob.getInstance(getConf()); Configuration config = job.getConfiguration(); for (int i = 1; i < args.length; i++) { if (args[i].equals("-stats")) { boolean toSort = false; if (i < args.length - 1 && "-sort".equals(args[i + 1])) { toSort = true; i++; } dbr.processStatJob(crawlDb, config, toSort); } else if (args[i].equals("-dump")) { param = args[++i]; String format = "normal"; String regex = null; Integer retry = null; String status = null; String expr = null; Float sample = null; for (int j = i + 1; j < args.length; j++) { if (args[j].equals("-format")) { format = args[++j]; i = i + 2; } if (args[j].equals("-regex")) { regex = args[++j]; i = i + 2; } if (args[j].equals("-retry")) { retry = Integer.parseInt(args[++j]); i = i + 2; } if (args[j].equals("-status")) { status = args[++j]; i = i + 2; } if (args[j].equals("-expr")) { expr = args[++j]; i = i + 2; } if (args[j].equals("-sample")) { sample = Float.parseFloat(args[++j]); i = i + 2; } } dbr.processDumpJob(crawlDb, param, config, format, regex, status, retry, expr, sample); } else if (args[i].equals("-url")) { param = args[++i]; StringBuilder output = new StringBuilder(); dbr.readUrl(crawlDb, param, config, output); System.out.print(output); } else if (args[i].equals("-topN")) { param = args[++i]; long topN = Long.parseLong(param); param = args[++i]; float min = 0.0f; if (i < args.length - 1) { min = Float.parseFloat(args[++i]); } dbr.processTopNJob(crawlDb, topN, min, param, config); } else if ((numConsumed = super.parseArgs(args, i)) > 0) { i += numConsumed - 1; } else { System.err.println("\nError: wrong argument " + args[i]); return -1; } } if (numConsumed > 0) { // Start listening return super.run(); } return 0; } public static void main(String[] args) throws Exception { int result = ToolRunner.run(NutchConfiguration.create(), new CrawlDbReader(), args); System.exit(result); } public Object query(Map<String, String> args, Configuration conf, String type, String crawlId) throws Exception { Map<String, Object> results = new HashMap<>(); String crawlDb = crawlId + "/crawldb"; if (type.equalsIgnoreCase("stats")) { boolean sort = false; if (args.containsKey("sort")) { if (args.get("sort").equalsIgnoreCase("true")) sort = true; } TreeMap<String, Writable> stats = processStatJobHelper(crawlDb, NutchConfiguration.create(), sort); LongWritable totalCnt = (LongWritable) stats.get("T"); stats.remove("T"); results.put("totalUrls", String.valueOf(totalCnt.get())); Map<String, Object> statusMap = new HashMap<>(); for (Map.Entry<String, Writable> entry : stats.entrySet()) { String k = entry.getKey(); long val = 0L; double fval = 0.0; if (entry.getValue() instanceof LongWritable) { val = ((LongWritable) entry.getValue()).get(); } else if (entry.getValue() instanceof FloatWritable) { fval = ((FloatWritable) entry.getValue()).get(); } else if (entry.getValue() instanceof BytesWritable) { continue; } if (k.equals("scn")) { results.put("minScore", String.valueOf(fval)); } else if (k.equals("scx")) { results.put("maxScore", String.valueOf(fval)); } else if (k.equals("sct")) { results.put("avgScore", String.valueOf((fval / totalCnt.get()))); } else if (k.startsWith("status")) { String[] st = k.split(" "); int code = Integer.parseInt(st[1]); if (st.length > 2) { @SuppressWarnings("unchecked") Map<String, Object> individualStatusInfo = (Map<String, Object>) statusMap .get(String.valueOf(code)); Map<String, String> hostValues; if (individualStatusInfo.containsKey("hostValues")) { hostValues = (Map<String, String>) individualStatusInfo.get("hostValues"); } else { hostValues = new HashMap<>(); individualStatusInfo.put("hostValues", hostValues); } hostValues.put(st[2], String.valueOf(val)); } else { Map<String, Object> individualStatusInfo = new HashMap<>(); individualStatusInfo.put("statusValue", CrawlDatum.getStatusName((byte) code)); individualStatusInfo.put("count", String.valueOf(val)); statusMap.put(String.valueOf(code), individualStatusInfo); } } else { results.put(k, String.valueOf(val)); } } results.put("status", statusMap); return results; } if (type.equalsIgnoreCase("dump")) { String output = args.get("out_dir"); String format = "normal"; String regex = null; Integer retry = null; String status = null; String expr = null; Float sample = null; if (args.containsKey("format")) { format = args.get("format"); } if (args.containsKey("regex")) { regex = args.get("regex"); } if (args.containsKey("retry")) { retry = Integer.parseInt(args.get("retry")); } if (args.containsKey("status")) { status = args.get("status"); } if (args.containsKey("expr")) { expr = args.get("expr"); } if (args.containsKey("sample")) { sample = Float.parseFloat(args.get("sample")); } processDumpJob(crawlDb, output, conf, format, regex, status, retry, expr, sample); File dumpFile = new File(output + "/part-00000"); return dumpFile; } if (type.equalsIgnoreCase("topN")) { String output = args.get("out_dir"); long topN = Long.parseLong(args.get("nnn")); float min = 0.0f; if (args.containsKey("min")) { min = Float.parseFloat(args.get("min")); } processTopNJob(crawlDb, topN, min, output, conf); File dumpFile = new File(output + "/part-00000"); return dumpFile; } if (type.equalsIgnoreCase("url")) { String url = args.get("url"); CrawlDatum res = get(crawlDb, url, conf); results.put("status", res.getStatus()); results.put("fetchTime", new Date(res.getFetchTime())); results.put("modifiedTime", new Date(res.getModifiedTime())); results.put("retriesSinceFetch", res.getRetriesSinceFetch()); results.put("retryInterval", res.getFetchInterval()); results.put("score", res.getScore()); results.put("signature", StringUtil.toHexString(res.getSignature())); Map<String, String> metadata = new HashMap<>(); if (res.getMetaData() != null) { for (Entry<Writable, Writable> e : res.getMetaData().entrySet()) { metadata.put(String.valueOf(e.getKey()), String.valueOf(e.getValue())); } } results.put("metadata", metadata); return results; } return results; } }