Java tutorial

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.util.hostdb;

import java.io.IOException;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.text.SimpleDateFormat;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.SynchronousQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.Date;
import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.CrawlDb;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.protocol.ProtocolStatus;
import org.apache.nutch.util.FSUtils;
import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.URLUtil;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Tool to create a HostDB from the CrawlDB. It aggregates fetch status values
 * by host and checks DNS entries for hosts.
 */
public class HostDb extends Configured implements Tool {
  public static final Logger LOG = LoggerFactory.getLogger(HostDb.class);
  public static final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

  public static final String LOCK_NAME = ".locked";
  public static final String CURRENT_NAME = "current";

  public static final String HOSTDB_PURGE_FAILED_HOSTS_THRESHOLD = "hostdb.purge.failed.hosts.threshold";
  public static final String HOSTDB_NUM_RESOLVER_THREADS = "hostdb.num.resolvers.threads";
  public static final String HOSTDB_RECHECK_INTERVAL = "hostdb.recheck.interval";
  public static final String HOSTDB_CHECK_FAILED = "hostdb.check.failed";
  public static final String HOSTDB_CHECK_NEW = "hostdb.check.new";
  public static final String HOSTDB_CHECK_KNOWN = "hostdb.check.known";
  public static final String HOSTDB_FORCE_CHECK = "hostdb.force.check";
  public static final String HOSTDB_URL_FILTERING = "hostdb.url.filter";
  public static final String HOSTDB_URL_NORMALIZING = "hostdb.url.normalize";

  /**
   * Mapper ingesting HostDB and CrawlDB entries. Additionally it can also read
   * host score info from a plain text key/value file generated by the
   * Webgraph's NodeDumper tool.
   */
  public static class HostDbMapper extends Mapper<Text, Writable, Text, NutchWritable> {
    private Text host = new Text();
    private HostDatum hostDatum = null;
    private CrawlDatum crawlDatum = null;
    private String reprUrl = null;
    private String buffer = null;
    private boolean filter = false;
    private boolean normalize = false;
    private boolean readingCrawlDb = false;
    private URLFilters filters = null;
    private URLNormalizers normalizers = null;

    public void setup(Context context) {
      Configuration conf = context.getConfiguration();
      readingCrawlDb = conf.getBoolean("hostdb.reading.crawldb", false);
      filter = conf.getBoolean(HOSTDB_URL_FILTERING, false);
      normalize = conf.getBoolean(HOSTDB_URL_NORMALIZING, false);
      if (filter)
        filters = new URLFilters(conf);
      if (normalize)
        normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_DEFAULT);
    }

    /* Filters and/or normalizes the input URL */
    private String filterNormalize(String u) {
      boolean isHost = false;
      String url = u;
      if (!u.startsWith("http://") && !u.startsWith("https://")) {
        // We received a hostname here so let's make a URL
        url = "http://" + u + "/";
        isHost = true;
      }
      try {
        if (normalizers != null)
          url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
        if (filters != null)
          url = filters.filter(url);
        if (isHost && url == null) {
          // All hosts may not allow HTTP scheme and just allow HTTPS scheme.
          // So, try to force HTTPS for domains which are filtered with HTTP scheme.
          // Note that this is a hacky way of getting around and does not work
          // for FTP and FILE schemes.
          String httpsUrl = "https://" + u + "/";
          if (normalizers != null)
            httpsUrl = normalizers.normalize(httpsUrl, URLNormalizers.SCOPE_DEFAULT);
          if (filters != null)
            httpsUrl = filters.filter(httpsUrl);
          url = httpsUrl;
        }
      } catch (Exception e) {
        return null;
      }
      return url;
    }

    /**
     * Mapper ingesting records from the HostDB, CrawlDB and plain-text host
     * scores file. Statistics and scores are passed on.
     */
    public void map(Text key, Writable value, Context context)
        throws IOException, InterruptedException {
      if (value instanceof CrawlDatum) {
        // This is a record from the CrawlDB
        // Get the normalized and filtered host of this URL
        buffer = filterNormalize(URLUtil.getHost(key.toString()));

        // Filtered out?
        if (buffer == null) {
          context.getCounter("HostDb", "filtered_records").increment(1);
          LOG.info(URLUtil.getHost(key.toString()) + " crawldatum has been filtered");
          return;
        }

        // Set the host of this URL
        host.set(buffer);
        crawlDatum = (CrawlDatum) value;
        hostDatum = new HostDatum();

        /**
         * Known limitation:
         * multi redirects: host_a => host_b/page => host_c/page/whatever
         *
         * We cannot re-resolve redirects for host objects as CrawlDatum metadata is
         * not available. We also cannot reliably use the reducer in all cases since
         * redirects may be across hosts or even domains. For now saving this for future
         * as multi-redirects are not very common on the entire internet.
         */

        // Check if the current key equals the host
        if (key.toString().equals("http://" + buffer + "/")) {
          // Check if this is a redirect to the real home page
          if (crawlDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM
              || crawlDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {
            // Obtain the repr url for this redirect via protocol status from the metadata
            ProtocolStatus z = (ProtocolStatus) crawlDatum.getMetaData()
                .get(Nutch.WRITABLE_PROTO_STATUS_KEY);

            // Get the protocol status' arguments
            reprUrl = z.getArgs()[0];
            if (reprUrl != null) {
              LOG.info("Homepage: " + key.toString() + " redirects to: " + reprUrl);
              hostDatum.setHomepageUrl(reprUrl);
            } else {
              LOG.info("Homepage: " + key.toString() + " redirects to: " + reprUrl
                  + " but has been filtered out");
            }
          } else {
            hostDatum.setHomepageUrl("http://" + buffer + "/");
            LOG.info("Homepage: " + "http://" + buffer + "/");
          }
        }

        hostDatum.setStat(crawlDatum.getStatus(), 1);
        context.write(host, new NutchWritable(hostDatum));
      } else if (value instanceof HostDatum) {
        // We got a record from the hostdb
        buffer = filterNormalize(key.toString());

        // Filtered out?
        if (buffer == null) {
          context.getCounter("HostDb", "filtered_records").increment(1);
          LOG.info(key.toString() + " hostdatum has been filtered");
          return;
        }

        // Get a HostDatum
        hostDatum = (HostDatum) value;
        key.set(buffer);

        // If we're also reading CrawlDb entries, reset db_* statistics because
        // we're aggregating them from CrawlDB anyway
        if (readingCrawlDb)
          hostDatum.resetStatistics();

        context.write(key, new NutchWritable(hostDatum));
      } else if (value instanceof Text) {
        // We got a record with host scores
        buffer = filterNormalize(key.toString());

        // Filtered out?
        if (buffer == null) {
          context.getCounter("HostDb", "filtered_records").increment(1);
          LOG.info(key.toString() + " score has been filtered");
          return;
        }

        key.set(buffer);
        context.write(key,
            new NutchWritable(new FloatWritable(Float.parseFloat(value.toString()))));
      }
    }
  }

  static class HostDbReducer extends Reducer<Text, NutchWritable, Text, HostDatum> {
    private ResolverThread resolverThread = null;
    private Integer numResolverThreads = 10;
    private static Integer purgeFailedHostsThreshold = -1;
    private static Integer recheckInterval = 86400000;
    private static boolean checkFailed = false;
    private static boolean checkNew = false;
    private static boolean checkKnown = false;
    private static boolean force = false;
    private static long now = new Date().getTime();
    private BlockingQueue<Runnable> queue = new SynchronousQueue<Runnable>();
    private ThreadPoolExecutor executor = null;

    /**
     * Configures the thread pool and prestarts all resolver threads.
     */
    public void setup(Context context) {
      Configuration conf = context.getConfiguration();
      purgeFailedHostsThreshold = conf.getInt(HOSTDB_PURGE_FAILED_HOSTS_THRESHOLD, -1);
      numResolverThreads = conf.getInt(HOSTDB_NUM_RESOLVER_THREADS, 10);
      recheckInterval = conf.getInt(HOSTDB_RECHECK_INTERVAL, 86400) * 1000;
      checkFailed = conf.getBoolean(HOSTDB_CHECK_FAILED, false);
      checkNew = conf.getBoolean(HOSTDB_CHECK_NEW, false);
      checkKnown = conf.getBoolean(HOSTDB_CHECK_KNOWN, false);
      force = conf.getBoolean(HOSTDB_FORCE_CHECK, false);

      // Initialize the thread pool with our queue
      executor = new ThreadPoolExecutor(numResolverThreads, numResolverThreads,
          5, TimeUnit.SECONDS, queue);

      // Run all threads in the pool
      executor.prestartAllCoreThreads();
    }

    public void reduce(Text key, Iterable<NutchWritable> values, Context context)
        throws IOException, InterruptedException {
      HostDatum hostDatum = new HostDatum();
      float score = 0;

      // Loop through all values until we find a non-empty HostDatum, or use an
      // empty one if this is a new host for the host db
      for (NutchWritable nutchWritable : values) {
        // Unwrap the NutchWritable to get at the actual record written by the mapper
        Writable value = nutchWritable.get();

        if (value instanceof HostDatum) {
          HostDatum buffer = (HostDatum) value;

          // Increment statistics only if this is not an existing HostDatum
          if (hostDatum.isEmpty()) {
            hostDatum.addStat(CrawlDatum.STATUS_DB_UNFETCHED, buffer);
            hostDatum.addStat(CrawlDatum.STATUS_DB_FETCHED, buffer);
            hostDatum.addStat(CrawlDatum.STATUS_DB_GONE, buffer);
            hostDatum.addStat(CrawlDatum.STATUS_DB_REDIR_PERM, buffer);
            hostDatum.addStat(CrawlDatum.STATUS_DB_REDIR_TEMP, buffer);
            hostDatum.addStat(CrawlDatum.STATUS_DB_NOTMODIFIED, buffer);
          }

          // Check homepage URL
          if (buffer.hasHomepageUrl())
            hostDatum.setHomepageUrl(buffer.getHomepageUrl());

          // Check lastCheck timestamp
          if (!buffer.isEmpty())
            hostDatum.setLastCheck(buffer.getLastCheck());

          // Check and set DNS failures
          if (buffer.getDnsFailures() > 0)
            hostDatum.setDnsFailures(buffer.getDnsFailures());

          // Check and set connection failures
          if (buffer.getConnectionFailures() > 0)
            hostDatum.setConnectionFailures(buffer.getConnectionFailures());

          // Check and set score (score from Web Graph has precedence)
          if (buffer.getScore() > 0)
            hostDatum.setScore(buffer.getScore());
        }

        // Check for the score
        if (value instanceof FloatWritable) {
          FloatWritable buffer = (FloatWritable) value;
          score = buffer.get();
        }
      }

      // Check if score was set from Web Graph
      if (score > 0)
        hostDatum.setScore(score);

      context.getCounter("HostDb", "total_hosts").increment(1);

      // See if this record is to be checked
      if (shouldCheck(hostDatum)) {
        // Make an entry
        resolverThread = new ResolverThread(key.toString(), hostDatum, context);

        // Add the entry to the queue (blocking)
        try {
          queue.put(resolverThread);
        } catch (InterruptedException e) {
          LOG.error("HostDb: " + StringUtils.stringifyException(e));
        }

        // Do not progress, the datum will be written in the resolver thread
        return;
      } else {
        context.getCounter("HostDb", "skipped_not_eligible").increment(1);
        LOG.info(key.toString() + ": skipped_not_eligible");
      }

      // Write the host datum if it wasn't written by the resolver thread
      context.write(key, hostDatum);
    }

    /**
     * Determines whether a record should be checked.
     */
    private boolean shouldCheck(HostDatum datum) {
      // Whether a new record is to be checked
      if (checkNew && datum.isEmpty()) {
        return true;
      }

      // Whether existing known hosts should be rechecked
      if (checkKnown && !datum.isEmpty() && datum.getDnsFailures() == 0) {
        return isEligibleForCheck(datum);
      }

      // Whether failed records are forced to be rechecked
      if (checkFailed && datum.getDnsFailures() > 0) {
        return isEligibleForCheck(datum);
      }

      // It seems this record is not to be checked
      return false;
    }

    /**
     * Determines whether a record is eligible for recheck.
     */
    private boolean isEligibleForCheck(HostDatum datum) {
      // Whether an existing host, known or unknown, is forced to be rechecked
      return (force || datum.getLastCheck().getTime()
          + (recheckInterval * datum.getDnsFailures() + 1) < now);
    }

    /**
     * Shut down all running threads and wait for completion.
     */
    public void cleanup(Context context) {
      LOG.info("Feeder finished, waiting for shutdown");

      // If we're here all keys have been fed and we can issue a shut down
      executor.shutdown();
      boolean finished = false;
      while (!finished) {
        try {
          // Wait for the executor to shut down completely
          if (!executor.isTerminated()) {
            LOG.info("Threads waiting: " + Integer.toString(executor.getPoolSize()));
            Thread.sleep(1000);
          } else {
            // All is well, get out
            finished = true;
          }
        } catch (InterruptedException e) {
          LOG.warn(StringUtils.stringifyException(e));
        }
      }
    }

    static class ResolverThread implements Runnable {
      private String host = null;
      private HostDatum datum = null;
      private Text hostText = new Text();
      private Context context = null;

      public ResolverThread(String host, HostDatum datum, Context context) {
        hostText.set(host);
        this.host = host;
        this.datum = datum;
        this.context = context;
      }

      public void run() {
        // Resolve the host and act appropriately
        datum.setLastCheck();
        try {
          // Throws an exception if host is not found
          InetAddress.getByName(host);

          if (datum.isEmpty()) {
            context.getCounter("HostDb", "new_known_host").increment(1);
            LOG.info(host + ": new_known_host " + datum);
          } else if (datum.getDnsFailures() > 0) {
            context.getCounter("HostDb", "rediscovered_host").increment(1);
            datum.setDnsFailures(0);
            LOG.info(host + ": rediscovered_host " + datum);
          } else {
            context.getCounter("HostDb", "existing_known_host").increment(1);
            LOG.info(host + ": existing_known_host " + datum);
          }

          // Write the host datum
          context.write(hostText, datum);
        } catch (UnknownHostException e) {
          try {
            // If the counter is empty we'll initialize with date = today and 1 failure
            if (datum.isEmpty()) {
              datum.setDnsFailures(1);
              context.write(hostText, datum);
              context.getCounter("HostDb", "new_unknown_host").increment(1);
              LOG.info(host + ": new_unknown_host " + datum);
            } else {
              datum.incDnsFailures();

              // Check if this host should be forgotten
              if (purgeFailedHostsThreshold == -1
                  || purgeFailedHostsThreshold < datum.getDnsFailures()) {
                context.write(hostText, datum);
                context.getCounter("HostDb", "existing_unknown_host").increment(1);
                LOG.info(host + ": existing_unknown_host " + datum);
              } else {
                context.getCounter("HostDb", "purged_unknown_host").increment(1);
                LOG.info(host + ": purged_unknown_host " + datum);
              }
            }
            context.getCounter("HostDb",
                Integer.toString(datum.numFailures()) + "_times_failed").increment(1);
          } catch (Exception ioe) {
            LOG.warn(StringUtils.stringifyException(ioe));
          }
        } catch (Exception e) {
          LOG.warn(StringUtils.stringifyException(e));
        }
        context.getCounter("HostDb", "checked_hosts").increment(1);
      }
    }
  }

  private void hostDb(Path hostDb, Path crawlDb, Path topHosts,
      boolean checkFailed, boolean checkNew, boolean
      checkKnown, boolean force, boolean filter, boolean normalize)
      throws Exception {
    long start = System.currentTimeMillis();
    LOG.info("HostDb: starting at " + sdf.format(start));

    Configuration conf = getConf();
    conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
    conf.setBoolean(HOSTDB_CHECK_FAILED, checkFailed);
    conf.setBoolean(HOSTDB_CHECK_NEW, checkNew);
    conf.setBoolean(HOSTDB_CHECK_KNOWN, checkKnown);
    conf.setBoolean(HOSTDB_FORCE_CHECK, force);
    conf.setBoolean(HOSTDB_URL_FILTERING, filter);
    conf.setBoolean(HOSTDB_URL_NORMALIZING, normalize);

    // Check whether the urlfilter-domainblacklist plugin is loaded
    if (filter && "urlfilter-domainblacklist".matches(conf.get("plugin.includes"))) {
      throw new Exception("domainblacklist-urlfilter must not be enabled");
    }

    // Check whether the urlnormalizer-host plugin is loaded
    if (normalize && "urlnormalizer-host".matches(conf.get("plugin.includes"))) {
      throw new Exception("urlnormalizer-host must not be enabled");
    }

    FileSystem fs = FileSystem.get(conf);
    Path old = new Path(hostDb, "old");
    Path current = new Path(hostDb, CURRENT_NAME);
    Path tempHostDb = new Path(hostDb,
        "hostdb-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    // Lock an existing hostdb to prevent multiple simultaneous updates
    Path lock = new Path(hostDb, LOCK_NAME);
    if (!fs.exists(current)) {
      fs.mkdirs(current);
    }
    LockUtil.createLockFile(fs, lock, false);

    Job job = new Job(conf, "HostDb " + hostDb);
    job.setJarByClass(HostDb.class);
    job.setSpeculativeExecution(false);
    MultipleInputs.addInputPath(job, current, SequenceFileInputFormat.class);

    if (topHosts != null) {
      MultipleInputs.addInputPath(job, topHosts, KeyValueTextInputFormat.class);
    }
    if (crawlDb != null) {
      // Tell the job we read from CrawlDB; set this on the job's own
      // configuration because the Job holds a copy of conf
      job.getConfiguration().setBoolean("hostdb.reading.crawldb", true);
      MultipleInputs.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME),
          SequenceFileInputFormat.class);
    }

    FileOutputFormat.setOutputPath(job, tempHostDb);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(NutchWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(HostDatum.class);
    job.setMapperClass(HostDbMapper.class);
    job.setReducerClass(HostDbReducer.class);

    try {
      job.waitForCompletion(true);
      FSUtils.replace(fs, old, current, true);
      FSUtils.replace(fs, current, tempHostDb, true);
      boolean preserveBackup = conf.getBoolean("db.preserve.backup", true);
      if (!preserveBackup && fs.exists(old))
        fs.delete(old, true);
    } catch (Exception e) {
      if (fs.exists(tempHostDb)) {
        fs.delete(tempHostDb, true);
      }
      LockUtil.removeLockFile(fs, lock);
      throw e;
    }

    LockUtil.removeLockFile(fs, lock);
    long end = System.currentTimeMillis();
    LOG.info("HostDb: finished at " + sdf.format(end) + ", elapsed: "
        + TimingUtil.elapsedTime(start, end));
  }

  public static void main(String args[]) throws Exception {
    int res = ToolRunner.run(NutchConfiguration.create(), new HostDb(), args);
    System.exit(res);
  }

  public static void usage() {
    System.err.println("Usage: HostDb <hostdb> "
        + "[-crawldb <crawldb>] [-tophosts <tophosts>] [-checkAll] [-checkFailed]"
        + " [-checkNew] [-checkKnown] [-force] [-noFilter] [-noNormalize]");
    System.err.println("\t<hostdb>\tdirectory name where hostdb is located");
    System.err.println("\t-crawldb <crawldb>\tpath to a crawldb directory");
    System.err.println("\t-tophosts <tophosts>\tkey-value text file from the Webgraph's NodeDumper tool having score");
    System.err.println("\t-checkAll\tApply DNS check to resolve all hosts");
    System.err.println("\t-checkFailed\tApply DNS check to resolve only on hosts which had failed DNS check earlier");
    System.err.println("\t-checkNew\tApply DNS check to resolve only new hosts");
    System.err.println("\t-checkKnown\tApply DNS check to resolve only known hosts");
    System.err.println("\t-force\t\tforce hosts to be rechecked. With earlier args, check "
        + "is done on host only if 'recheckInterval' has elapsed.");
    System.err.println("\t-noFilter\tturn off URLFilters on urls");
    System.err.println("\t-noNormalize\tturn off URLNormalizer on urls");
  }

  public int run(String[] args) throws Exception {
    if (args.length < 2) {
      usage();
      return -1;
    }

    Path hostDb = new Path(args[0]);
    Path crawlDb = null;
    Path topHosts = null;
    boolean checkFailed = false;
    boolean checkNew = false;
    boolean checkKnown = false;
    boolean force = false;
    boolean filter = true;
    boolean normalize = true;

    for (int i = 1; i < args.length; i++) {
      if (args[i].equals("-crawldb")) {
        crawlDb = new Path(args[++i]);
        LOG.info("HostDb: crawldb: " + crawlDb);
      } else if (args[i].equals("-tophosts")) {
        topHosts = new Path(args[++i]);
        LOG.info("HostDb: tophosts: " + topHosts);
      } else if (args[i].equals("-checkFailed")) {
        LOG.info("HostDb: checking failed hosts");
        checkFailed = true;
      } else if (args[i].equals("-checkNew")) {
        LOG.info("HostDb: checking new hosts");
        checkNew = true;
      } else if (args[i].equals("-checkKnown")) {
        LOG.info("HostDb: checking known hosts");
        checkKnown = true;
      } else if (args[i].equals("-checkAll")) {
        LOG.info("HostDb: checking all hosts");
        checkFailed = true;
        checkNew = true;
        checkKnown = true;
      } else if (args[i].equals("-force")) {
        LOG.info("HostDb: forced check");
        force = true;
      } else if (args[i].equals("-noFilter")) {
        LOG.info("HostDb: filtering disabled");
        filter = false;
      } else if (args[i].equals("-noNormalize")) {
        LOG.info("HostDb: normalizing disabled");
        normalize = false;
      } else {
        LOG.info("HostDb: Found invalid argument \"" + args[i] + "\"\n");
        usage();
        return -1;
      }
    }

    try {
      hostDb(hostDb, crawlDb, topHosts, checkFailed, checkNew, checkKnown, force, filter, normalize);
      return 0;
    } catch (Exception e) {
      LOG.error("HostDb: " + StringUtils.stringifyException(e));
      return -1;
    }
  }
}
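
The tool is normally started from the command line through its main() method, for example: HostDb crawl/hostdb -crawldb crawl/crawldb -checkAll (the paths are only illustrative). The minimal sketch below shows the equivalent programmatic invocation via Hadoop's ToolRunner, which is exactly what main() already does; the driver class name HostDbExample and the directory paths are illustrative assumptions and not part of the Nutch codebase.

import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.hostdb.HostDb;

public class HostDbExample {
  public static void main(String[] args) throws Exception {
    // Arguments exactly as documented by HostDb.usage(); the paths are
    // illustrative and would point at an existing Nutch crawl directory.
    String[] toolArgs = {
        "crawl/hostdb",              // <hostdb> directory to create or update
        "-crawldb", "crawl/crawldb", // aggregate fetch statuses from this CrawlDb
        "-checkAll"                  // DNS-check new, known and failed hosts
    };

    // ToolRunner handles generic Hadoop options and then calls
    // HostDb.run(toolArgs) with the Nutch configuration created here.
    int res = ToolRunner.run(NutchConfiguration.create(), new HostDb(), toolArgs);
    System.exit(res);
  }
}

As listed in usage(), a -tophosts <file> argument can additionally point at a key/value dump produced by the Webgraph's NodeDumper tool to seed host scores, while -noFilter and -noNormalize switch off URL filtering and normalization during the update.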