Java tutorial
/* Nutchwax * * $Id: Nutchwax.java 1896 2007-08-01 21:44:31Z jlee-archive $ * * Created on Feb 14, 2006 * * Copyright (C) 2006 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ package org.archive.access.nutch; import java.io.FileNotFoundException; import java.io.IOException; import java.lang.reflect.Method; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.mapred.JobConf; import org.apache.nutch.crawl.CrawlDb; import org.apache.nutch.crawl.Generator; import org.apache.nutch.indexer.DeleteDuplicates; import org.apache.nutch.indexer.IndexMerger; import org.archive.access.nutch.jobs.ImportArcs; import org.archive.access.nutch.jobs.NutchwaxCrawlDb; import org.archive.access.nutch.jobs.NutchwaxIndexer; import org.archive.access.nutch.jobs.NutchwaxLinkDb; import org.archive.access.nutch.jobs.NutchwaxPagerank; import org.archive.util.ArchiveUtils; import org.apache.nutch.global.Global; /** * Script to run all indexing jobs from index through merge of final index. */ public class Nutchwax { public static final Log LOG = LogFactory.getLog(Nutchwax.class.getName()); private static final String KEY_COLLECTION_PREFIX = "c="; private static final String KEY_COLLECTION_SUFFIX = ",u="; private static final Pattern COLLECTION = Pattern.compile("^\\s*c=([^,]+),u=(.*)\\s*", Pattern.DOTALL); private final static List JOBS = Arrays.asList(new String[] { "import", "update", "invert", "pagerank", "index", "dedup", "merge", "all", "class", "search", "multiple", "version" }); // Lazy initialize these two variables to delay complaint about hadoop not // being present -- if its not. Meantime I get command-line processing // done. private FileSystem fs = null; private JobConf conf = null; /** * Default constructor. * @throws IOException */ public Nutchwax() throws IOException { super(); } public synchronized JobConf getJobConf() { if (this.conf == null) { this.conf = new JobConf(NutchwaxConfiguration.getConfiguration()); } return this.conf; } public synchronized FileSystem getFS() throws IOException { if (this.fs == null) { this.fs = FileSystem.get(getJobConf()); } return this.fs; } public class OutputDirectories { private final Path output; private final Path crawlDb; private final Path linkDb; private final Path pagerank; private final Path segments; private final Path indexes; private final Path index; private final Path tmpDir; public OutputDirectories(final Path output) throws IOException { this.output = output; this.crawlDb = new Path(output + "/crawldb"); this.linkDb = new Path(output + "/linkdb"); this.pagerank = new Path(output + "/pagerank"); this.segments = new Path(output + "/segments"); this.indexes = new Path(output + "/indexes"); this.index = new Path(output + "/index"); this.tmpDir = getJobConf().getLocalPath("mapred.temp.dir", Generator.generateSegmentName()); } public Path getCrawlDb() { return crawlDb; } public Path getIndexes() { return indexes; } public Path getLinkDb() { return linkDb; } public Path getPagerank() { return pagerank; } public Path getSegments() { return segments; } public Path getTmpDir() { return tmpDir; } public Path getIndex() { return index; } public Path getOutput() { return output; } } /** * Run passed list of mapreduce indexing jobs. Jobs are always run in * order: import, update, etc. * * @throws Exception */ protected void doAll(final Path input, final String collectionName, final OutputDirectories od) throws Exception { doImport(input, collectionName, od); doUpdate(od); doInvert(od); doPagerank(od); doIndexing(od); doDedup(od); doMerge(od); LOG.info("Nutchwax finished."); } protected void doImport(final Path input, String collectionName, final OutputDirectories od) throws IOException { Path segment = new Path(od.getSegments(), Generator.generateSegmentName() + ((collectionName == null || collectionName.length() <= 0) ? "" : "-" + collectionName)); new ImportArcs(getJobConf()).importArcs(input, segment, collectionName); } protected void doUpdate(final OutputDirectories od) throws IOException { doUpdate(od, null); } protected void doUpdate(final OutputDirectories od, final String[] segments) throws IOException { LOG.info("updating crawldb " + od.getCrawlDb()); // Need to make sure the db dir exists before progressing. Path dbPath = new Path(od.getCrawlDb(), CrawlDb.CURRENT_NAME); if (!getFS().exists(dbPath)) { getFS().mkdirs(dbPath); } CrawlDb cdb = new NutchwaxCrawlDb(getJobConf()); if (segments != null) { List<Path> paths = new ArrayList<Path>(segments.length); for (int i = 0; i < segments.length; i++) { Path p = new Path(segments[i]); if (!getFS().exists(p)) { throw new FileNotFoundException(p.toString()); } paths.add(p); } cdb.update(od.getCrawlDb(), paths.toArray(new Path[paths.size()]), true, true); } else { Path[] allSegments = getSegments(od); // This just does the last segment created. cdb.update(od.getCrawlDb(), new Path[] { allSegments[allSegments.length - 1] }, true, true); } } protected Path[] getSegments(final OutputDirectories od) throws IOException { Path[] allSegments = getFS().listPaths(od.getSegments()); if (allSegments == null || allSegments.length <= 0) { throw new FileNotFoundException(od.getSegments().toString()); } return allSegments; } protected void doInvert(final OutputDirectories od, final Path[] segments) throws IOException { createLinkdb(od); new NutchwaxLinkDb(getJobConf()).invert(od.getLinkDb(), segments, true, true, false); } protected void doInvert(final OutputDirectories od) throws IOException { LOG.info("inverting links in " + od.getSegments()); new NutchwaxLinkDb(getJobConf()).invert(od.getLinkDb(), getSegments(od), true, true, false); } protected boolean createLinkdb(final OutputDirectories od) throws IOException { boolean result = false; // Make sure the linkdb exists. Otherwise the install where // the temporary location gets moved to the permanent fails. if (getFS().mkdirs(new Path(od.getLinkDb(), NutchwaxLinkDb.CURRENT_NAME))) { LOG.info("Created " + od.getLinkDb()); result = true; } return result; } protected void doPagerank(final OutputDirectories od) throws IOException { LOG.info("computing pagerank scores in " + od.getPagerank()); new NutchwaxPagerank(getJobConf()).process(getSegments(od), od.getPagerank()); } protected void doIndexing(final OutputDirectories od) throws IOException { doIndexing(od, getFS().listPaths(od.getSegments())); } protected void doIndexing(final OutputDirectories od, final Path[] segments) throws IOException { LOG.info(" indexing " + segments); new NutchwaxIndexer(getJobConf()).index(od.getIndexes(), od.getPagerank(), od.getCrawlDb(), od.getLinkDb(), segments); } protected void doDedup(final OutputDirectories od) throws IOException { LOG.info("dedup " + od.getIndex()); new DeleteDuplicates(getJobConf()).dedup(new Path[] { od.getIndexes() }); } protected void doMerge(final OutputDirectories od) throws IOException { LOG.info("index merge " + od.getOutput() + " using tmpDir=" + od.getTmpDir()); new IndexMerger(getJobConf()).merge(getFS().listPaths(od.getIndexes()), od.getIndex(), od.getTmpDir()); } static String[] rewriteArgs(final String[] args, final int offset) { final String[] newArgs = new String[args.length - offset]; for (int i = 0; i < args.length; i++) { if (i < offset) { continue; } newArgs[i - offset] = args[i]; } return newArgs; } static Object doClassMain(final String[] args) { // Redo args so absent our nutchwax 'class' command. final String className = args[1]; String[] newArgs = rewriteArgs(args, 2); // From http://www.javaworld.com/javaworld/javaqa/1999-06/01-outside.html Class[] argTypes = new Class[1]; argTypes[0] = String[].class; Object result = null; try { Method mainMethod = Class.forName(className).getDeclaredMethod("main", argTypes); result = mainMethod.invoke(newArgs, new Object[] { newArgs }); } catch (Throwable t) { t.printStackTrace(); } return result; } protected Object doSearch(final String[] args) { String[] newArgs = new String[args.length + 1]; newArgs[0] = args[0]; newArgs[1] = NutchwaxBean.class.getName(); for (int i = 1; i < args.length; i++) { newArgs[i + 1] = args[i]; } return doClassMain(newArgs); } protected void doMultiple(final String[] args) throws Exception { (new Multiple()).run(rewriteArgs(args, 1)); } protected void doVersion(final String[] args) throws Exception { JobConf job = getJobConf(); String collectionType = job.get(Global.COLLECTION_TYPE); System.out.println("Collection type:" + collectionType); } protected void doJob(final String jobName, final String[] args) throws Exception { if (jobName.equals("import")) { // Usage: hadoop jar nutchwax.jar import input output name if (args.length != 4) { ImportArcs.doImportUsage("ERROR: Wrong number of arguments passed.", 2); } final Path input = new Path(args[1]); final Path output = new Path(args[2]); final String collectionName = args[3]; checkArcsDir(input); OutputDirectories od = new OutputDirectories(output); doImport(input, collectionName, od); } else if (jobName.equals("update")) { // Usage: hadoop jar nutchwax.jar update output if (args.length < 2) { doUpdateUsage("ERROR: Wrong number of arguments passed.", 2); } OutputDirectories od = new OutputDirectories(new Path(args[1])); if (args.length == 2) { doUpdate(od); } else { for (int i = 2; i < args.length; i++) { doUpdate(od, new String[] { args[i] }); } } } else if (jobName.equals("invert")) { // Usage: hadoop jar nutchwax.jar invert output if (args.length < 2) { doInvertUsage("ERROR: Wrong number of arguments passed.", 2); } OutputDirectories od = new OutputDirectories(new Path(args[1])); if (args.length == 2) { doInvert(od); } else { final int offset = 2; Path[] segments = new Path[args.length - offset]; for (int i = offset; i < args.length; i++) { Path f = new Path(args[i]); if (!getFS().exists(f)) { throw new FileNotFoundException(f.toString()); } segments[i - offset] = f; } doInvert(od, segments); } } /* TODO MC */ else if (jobName.equals("pagerank")) { // Usage: hadoop jar nutchwax.jar pagerank output if (args.length != 2) { doPagerankUsage("ERROR: Wrong number of arguments passed.", 2); } OutputDirectories od = new OutputDirectories(new Path(args[1])); doPagerank(od); } /* TODO MC */ else if (jobName.equals("index")) { // Usage: hadoop jar nutchwax.jar index output if (args.length < 2) { doIndexUsage("ERROR: Wrong number of arguments passed.", 2); } OutputDirectories od = new OutputDirectories(new Path(args[1])); if (args.length == 2) { doIndexing(od); } else { final int offset = 2; Path[] segments = new Path[args.length - offset]; for (int i = offset; i < args.length; i++) { Path f = new Path(args[i]); if (!getFS().exists(f)) { throw new FileNotFoundException(f.toString()); } segments[i - offset] = f; } doIndexing(od, segments); } } else if (jobName.equals("dedup")) { // Usage: hadoop jar nutchwax.jar dedup output if (args.length != 2) { doDedupUsage("Wrong number of arguments passed.", 2); } doDedup(new OutputDirectories(new Path(args[1]))); } else if (jobName.equals("merge")) { // Usage: hadoop jar nutchwax.jar merge output"); if (args.length != 2) { doMergeUsage("ERROR: Wrong number of arguments passed.", 2); } doMerge(new OutputDirectories(new Path(args[1]))); } else if (jobName.equals("all")) { // Usage: hadoop jar nutchwax.jar import input output name if (args.length != 4) { doAllUsage("ERROR: Wrong number of arguments passed.", 2); } final Path input = new Path(args[1]); final Path output = new Path(args[2]); final String collectionName = args[3]; checkArcsDir(input); OutputDirectories od = new OutputDirectories(output); doAll(input, collectionName, od); } else if (jobName.equals("class")) { if (args.length < 2) { doClassUsage("ERROR: Wrong number of arguments passed.", 2); } doClassMain(args); } else if (jobName.equals("search")) { if (args.length < 1) { doClassUsage("ERROR: Wrong number of arguments passed.", 2); } doSearch(args); } else if (jobName.equals("multiple")) { doMultiple(args); } else if (jobName.equals("version")) { doVersion(args); } else { usage("ERROR: No handler for job name " + jobName, 4); System.exit(0); } } /** * Check the arcs dir exists and looks like it has files that list ARCs * (rather than ARCs themselves). * * @param arcsDir Directory to examine. * @throws IOException */ protected void checkArcsDir(final Path arcsDir) throws IOException { if (!getFS().exists(arcsDir)) { throw new IOException(arcsDir + " does not exist."); } if (!fs.isDirectory(arcsDir)) { throw new IOException(arcsDir + " is not a directory."); } final Path[] files = getFS().listPaths(arcsDir); for (int i = 0; i < files.length; i++) { if (!getFS().isFile(files[i])) { throw new IOException(files[i] + " is not a file."); } if (files[i].getName().toLowerCase().endsWith(".arc.gz")) { throw new IOException(files[i] + " is an ARC file (ARCSDIR " + "should contain text file listing ARCs rather than " + "actual ARCs)."); } } } public static Text generateWaxKey(WritableComparable key, final String collection) { return generateWaxKey(key.toString(), collection); } public static Text generateWaxKey(final String keyStr, final String collection) { if (collection == null) { throw new NullPointerException("Collection is null for " + keyStr); } if (keyStr == null) { throw new NullPointerException("keyStr is null"); } if (keyStr.startsWith(KEY_COLLECTION_PREFIX)) { LOG.warn("Key already has collection prefix: " + keyStr + ". Skipping."); return new Text(keyStr); } return new Text(KEY_COLLECTION_PREFIX + collection.trim() + KEY_COLLECTION_SUFFIX + keyStr.trim()); } public static String getCollectionFromWaxKey(final WritableComparable key) throws IOException { Matcher m = COLLECTION.matcher(key.toString()); if (m == null || !m.matches()) { throw new IOException("Key doesn't have collection " + "prefix <" + key.toString() + ">"); } return m.group(1); } public static String getUrlFromWaxKey(final WritableComparable key) throws IOException { Matcher m = COLLECTION.matcher(key.toString()); if (m == null || !m.matches()) { throw new IOException("Key doesn't have collection " + " prefix: " + key); } return m.group(2); } public static long getDate(String d) throws IOException { long date = 0; try { date = ArchiveUtils.getDate(d).getTime(); } catch (final java.text.ParseException e) { throw new IOException("Failed parse of date: " + d + ": " + e.getMessage()); } // Date can be < 0 if pre-1970 (Seen in some old ARCs). return date >= 0 ? date : 0; } public static void usage(final String message, final int exitCode) { if (message != null && message.length() > 0) { System.out.println(message); } System.out.println("Usage: hadoop jar nutchwax.jar <job> [args]"); System.out.println("Launch NutchWAX job(s) on a hadoop platform."); System.out.println("Type 'hadoop jar nutchwax.jar help <job>' for" + " help on a specific job."); System.out.println("Jobs (usually) must be run in the order " + "listed below."); System.out.println("Available jobs:"); System.out.println(" import Import ARCs."); System.out.println(" update Update dbs with recent imports."); System.out.println(" invert Invert links."); System.out.println(" pagerank Compute pagerank."); // TODO MC System.out.println(" index Index segments."); System.out.println(" dedup Deduplicate by URL or content MD5."); System.out.println(" merge Merge segment indices into one."); System.out.println(" all Runs all above jobs in order."); System.out.println(" class Run the passed class's main."); System.out.println(" search Run a query against index under " + "property 'searcher.dir'"); System.out.println(" multiple Run multiple concurrent tasks."); System.out.println(" version Indicates the software version."); System.exit(exitCode); } public static void doUpdateUsage(final String message, final int exitCode) { if (message != null && message.length() > 0) { System.out.println(message); } System.out.println("Usage: hadoop jar nutchwax.jar update <output> " + "[<segments>...]"); System.out.println("Arguments:"); System.out.println(" output Directory to write crawldb under."); System.out.println("Options:"); System.out.println(" segments List of segments to update crawldb " + "with. If none supplied, updates"); System.out.println(" using latest segment found."); System.exit(exitCode); } public static void doInvertUsage(final String message, final int exitCode) { if (message != null && message.length() > 0) { System.out.println(message); } System.out.println("Usage: hadoop jar nutchwax.jar invert <output> " + "[<segments>...]"); System.out.println("Arguments:"); System.out.println(" output Directory to write linkdb under."); System.out.println("Options:"); System.out.println(" segments List of segments to update linkdb " + "with. If none supplied, all under"); System.out.println(" '<output>/segments/' " + "are passed."); System.exit(exitCode); } /* TODO MC */ public static void doPagerankUsage(final String message, final int exitCode) { if (message != null && message.length() > 0) { System.out.println(message); } System.out.println("Usage: hadoop jar nutchwax.jar pagerank <output> "); System.out.println("Arguments:"); System.out.println(" output Directory to write pagerank under."); System.exit(exitCode); } /* TODO MC */ public static void doIndexUsage(final String message, final int exitCode) { if (message != null && message.length() > 0) { System.out.println(message); } System.out.println("Usage: hadoop jar nutchwax.jar index <output> " + "[<segments>...]"); System.out.println("Arguments:"); System.out.println(" output Directory to write indexes under."); System.out.println("Options:"); System.out.println(" segments List of segments to index. " + "If none supplied, all under"); System.out.println(" '<output>/segments/' " + "are indexed."); System.exit(exitCode); } public static void doDedupUsage(final String message, final int exitCode) { if (message != null && message.length() > 0) { System.out.println(message); } System.out.println("Usage: hadoop jar nutchwax.jar dedup <output>"); System.out.println("Arguments:"); System.out.println(" output Directory in which indices" + " to dedup reside."); System.exit(exitCode); } public static void doMergeUsage(final String message, final int exitCode) { if (message != null && message.length() > 0) { System.out.println(message); } System.out.println("Usage: hadoop jar nutchwax.jar merge <output>"); System.out.println("Arguments:"); System.out.println(" output Directory in which indices" + " to merge reside."); System.exit(exitCode); } public static void doMultipleUsage(final String message, final int exitCode) { if (message != null && message.length() > 0) { System.out.println(message); } Multiple.usage(); System.exit(exitCode); } public static void doSearchUsage(final String message, final int exitCode) { if (message != null && message.length() > 0) { System.out.println(message); } System.out.println("Usage: hadoop jar nutchwax.jar search <query>"); System.out.println("Arguments:"); System.out.println(" query Query string to run against index under " + "property 'searcher.dir'"); System.exit(exitCode); } public static void doAllUsage(final String message, final int exitCode) { if (message != null && message.length() > 0) { System.out.println(message); } System.out.println("Usage: hadoop jar nutchwax.jar import <input>" + " <output> <collection>"); System.out.println("Arguments:"); System.out.println(" input Directory of files" + " listing ARC URLs to import"); System.out.println(" output Directory to import to. Inport is " + "written to a subdir named"); System.out.println(" for current date plus collection " + "under '<output>/segments/'"); System.out.println(" collection Collection name. Added to" + " each resource."); System.exit(exitCode); } public static void doClassUsage(final String message, final int exitCode) { if (message != null && message.length() > 0) { System.out.println(message); } System.out.println("Usage: hadoop jar nutchwax.jar class CLASS ..."); System.out.println("Arguments:"); System.out.println(" CLASS Name of class to run. Invokes main " + "passing command-line arguments."); System.out.println(" For example, use to run nutch " + "commands. Below is list of command"); System.out.println(" name and implementing class. " + "Pass name of class only and emits usage."); System.out.println(); System.out.println(" readdb " + "org.apache.nutch.crawl.CrawlDbReader"); System.out.println(" mergedb " + "org.apache.nutch.crawl.CrawlDbMerger"); System.out.println(" readlinkdb " + "org.apache.nutch.crawl.LinkDbReader"); System.out.println(" segread " + "org.apache.nutch.segment.SegmentReader"); System.out.println(" mergesegs " + "org.apache.nutch.segment.SegmentMerger"); System.out.println(" mergelinkdb " + "org.apache.nutch.crawl.LinkDbMerger"); System.exit(exitCode); } static void doJobHelp(final String jobName) { if (!JOBS.contains(jobName)) { usage("ERROR: Unknown job " + jobName, 1); } if (jobName.equals("import")) { ImportArcs.doImportUsage(null, 1); } else if (jobName.equals("update")) { doUpdateUsage(null, 1); } else if (jobName.equals("invert")) { doInvertUsage(null, 1); } /* TODO MC */ else if (jobName.equals("pagerank")) { doPagerankUsage(null, 1); } /* TODO MC */ else if (jobName.equals("index")) { doIndexUsage(null, 1); } else if (jobName.equals("dedup")) { doDedupUsage(null, 1); } else if (jobName.equals("merge")) { doMergeUsage(null, 1); } else if (jobName.equals("all")) { doAllUsage(null, 1); } else if (jobName.equals("search")) { doSearchUsage(null, 1); } else if (jobName.equals("multiple")) { doMultipleUsage(null, 1); } else if (jobName.equals("class")) { doClassUsage(null, 1); } else { usage("ERROR: No help for job name " + jobName, 4); } } public static void main(String args[]) throws Exception { if (args.length < 1) { usage(null, 0); return; } if (args[0].toLowerCase().equals("help")) { if (args.length == 1) { usage("ERROR: Add command you need help on.", 0); return; } doJobHelp(args[1].toLowerCase()); } final String jobName = args[0].toLowerCase(); if (!JOBS.contains(jobName)) { usage("ERROR: Unknown <job> " + jobName, 1); } Nutchwax ia = new Nutchwax(); ia.doJob(jobName, args); } }