Java tutorial
/*******************************************************************************
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package org.apache.nutch.mapreduce;

import org.apache.avro.util.Utf8;
import org.apache.commons.lang3.StringUtils;
import org.apache.gora.filter.MapFieldValueFilter;
import org.apache.gora.store.DataStore;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.URLPartitioner.FetchEntryPartitioner;
import org.apache.nutch.fetch.FetchMode;
import org.apache.nutch.fetch.FetchMonitor;
import org.apache.nutch.fetch.data.FetchEntry;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.service.NutchMaster;
import org.apache.nutch.storage.StorageUtils;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.Params;
import org.slf4j.Logger;

import java.util.Collection;
import java.util.HashSet;
import java.util.Map;

import static org.apache.nutch.metadata.Nutch.*;

/**
 * Fetch job: fetches the web pages generated for the given batch.
 */
public class FetchJob extends NutchJob implements Tool {

  private static final Logger LOG = FetchMonitor.LOG;

  public static final String PROTOCOL_REDIR = "protocol";
  public static final int PERM_REFRESH_TIME = 5;
  public static final Utf8 REDIRECT_DISCOVERED = new Utf8("___rdrdsc__");

  private static final Collection<WebPage.Field> FIELDS = new HashSet<>();

  static {
    FIELDS.add(WebPage.Field.MARKERS);
    FIELDS.add(WebPage.Field.REPR_URL);
    FIELDS.add(WebPage.Field.FETCH_TIME);
    FIELDS.add(WebPage.Field.METADATA);
  }

  private int numTasks = 2;
  private String batchId = Nutch.ALL_BATCH_ID_STR;

  public FetchJob() {
  }

  public FetchJob(Configuration conf) {
    setConf(conf);
  }

  /**
   * The field list affects which fields are read; it does not affect which
   * fields are written.
   */
  public Collection<WebPage.Field> getFields(Job job) {
    Collection<WebPage.Field> fields = new HashSet<>(FIELDS);
    if (job.getConfiguration().getBoolean(PARAM_PARSE, false)) {
      ParserJob parserJob = new ParserJob();
      fields.addAll(parserJob.getFields(job));
    }
    ProtocolFactory protocolFactory = new ProtocolFactory(job.getConfiguration());
    fields.addAll(protocolFactory.getFields());

    return fields;
  }

  @Override
  protected void setup(Map<String, Object> args) throws Exception {
    super.setup(args);

    Params params = new Params(args);
    Configuration conf = getConf();

    checkConfiguration(conf);

    String crawlId = params.get(ARG_CRAWL, conf.get(Nutch.PARAM_CRAWL_ID));
    FetchMode fetchMode = params.getEnum(ARG_FETCH_MODE,
        conf.getEnum(PARAM_FETCH_MODE, FetchMode.NATIVE));
    batchId = params.get(ARG_BATCH, ALL_BATCH_ID_STR);
    int threads = params.getInt(ARG_THREADS, 5);
    boolean resume = params.getBoolean(ARG_RESUME, false);
    int limit = params.getInt(ARG_LIMIT, -1);
    numTasks = params.getInt(ARG_NUMTASKS, conf.getInt(PARAM_MAPREDUCE_JOB_REDUCES, 2));
    boolean index = params.getBoolean(ARG_INDEX, false);

    // Solr
    String solrUrl = params.get(ARG_SOLR_URL, conf.get(PARAM_SOLR_SERVER_URL));
    String zkHostString = params.get(ARG_ZK, conf.get(PARAM_SOLR_ZK));
    String solrCollection = params.get(ARG_COLLECTION, conf.get(PARAM_SOLR_COLLECTION));

    // Set re-computed config variables
    NutchConfiguration.setIfNotNull(conf, PARAM_CRAWL_ID, crawlId);
    conf.setEnum(PARAM_FETCH_MODE, fetchMode);
    NutchConfiguration.setIfNotNull(conf, PARAM_BATCH_ID, batchId);
    conf.setInt(PARAM_THREADS, threads);
    conf.setBoolean(PARAM_RESUME, resume);
    conf.setInt(PARAM_MAPPER_LIMIT, limit);
    conf.setInt(PARAM_MAPREDUCE_JOB_REDUCES, numTasks);
    conf.setBoolean(PARAM_INDEX_JUST_IN_TIME, index);
    NutchConfiguration.setIfNotNull(conf, PARAM_SOLR_SERVER_URL, solrUrl);
    NutchConfiguration.setIfNotNull(conf, PARAM_SOLR_ZK, zkHostString);
    NutchConfiguration.setIfNotNull(conf, PARAM_SOLR_COLLECTION, solrCollection);

    LOG.info(Params.format(
        "className", this.getClass().getSimpleName(),
        "crawlId", crawlId,
        "batchId", batchId,
        "fetchMode", fetchMode,
        "numTasks", numTasks,
        "threads", threads,
        "resume", resume,
        "limit", limit,
        "index", index,
        "solrUrl", solrUrl,
        "zkHostString", zkHostString,
        "solrCollection", solrCollection
    ));
  }
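  /*
   * Illustrative sketch (not part of the original class, values assumed): the
   * args map consumed by setup() is typically built with Params.toArgMap, as
   * the fetch() helpers below do:
   *
   *   Map<String, Object> args = Params.toArgMap(
   *       ARG_CRAWL, "example_crawl",       // assumed crawl id
   *       ARG_BATCH, "1467345600000-1024",  // assumed batch id from Generator
   *       ARG_THREADS, 10,
   *       ARG_RESUME, true);
   */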
  @Override
  protected void doRun(Map<String, Object> args) throws Exception {
    // For politeness, don't permit parallel execution of a single task
    currentJob.setReduceSpeculativeExecution(false);

    Collection<WebPage.Field> fields = getFields(currentJob);
    MapFieldValueFilter<String, WebPage> batchIdFilter = getBatchIdFilter(batchId);
    StorageUtils.initMapperJob(currentJob, fields, IntWritable.class,
        FetchEntry.class, FetchMapper.class, FetchEntryPartitioner.class,
        batchIdFilter, false);
    StorageUtils.initReducerJob(currentJob, FetchReducer.class);
    currentJob.setNumReduceTasks(numTasks);

    // Used to get the schema name
    DataStore<String, WebPage> store = StorageUtils.createWebStore(getConf(),
        String.class, WebPage.class);

    LOG.debug("Loaded Query Fields : " + StringUtils.join(StorageUtils.toStringArray(fields), ", "));

    LOG.info(Params.format(
        "className", this.getClass().getSimpleName(),
        "workingDir", currentJob.getWorkingDirectory(),
        "jobName", currentJob.getJobName(),
        "realSchema", store.getSchemaName()
    ));

    currentJob.waitForCompletion(true);
  }

  public int fetch(String crawlId, String fetchMode, String batchId, int threads,
      boolean resume, int limit) throws Exception {
    return fetch(crawlId, fetchMode, batchId, threads, resume, limit, 2, false, null, null, null);
  }

  public int fetch(String crawlId, String fetchMode, String batchId, int threads,
      boolean resume, int limit, int numTasks) throws Exception {
    return fetch(crawlId, fetchMode, batchId, threads, resume, limit, numTasks, false, null, null, null);
  }

  /**
   * Run fetcher.
   *
   * @param batchId
   *          batchId (obtained from Generator) or null to fetch all generated
   *          fetchlists
   * @param threads
   *          number of threads per map task
   * @param resume
   *          resume an interrupted job
   * @param numTasks
   *          number of fetching tasks (reducers). If set to < 1 then use the
   *          default, which is mapreduce.job.reduces.
   * @return 0 on success
   * @throws Exception
   */
  public int fetch(String crawlId, String fetchMode, String batchId, int threads,
      boolean resume, int limit, int numTasks, boolean index,
      String solrUrl, String zkHostString, String collection) throws Exception {
    run(Params.toArgMap(
        ARG_CRAWL, crawlId,
        ARG_FETCH_MODE, fetchMode,
        ARG_BATCH, batchId,
        ARG_THREADS, threads,
        ARG_RESUME, resume,
        ARG_NUMTASKS, numTasks > 0 ? numTasks : null,
        ARG_LIMIT, limit > 0 ? limit : null,
        ARG_INDEX, index,
        ARG_SOLR_URL, solrUrl,
        ARG_ZK, zkHostString,
        ARG_COLLECTION, collection
    ));

    return 0;
  }
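  /*
   * Illustrative sketch (not part of the original class, values assumed):
   * invoking the fetcher programmatically. ALL_BATCH_ID_STR fetches every
   * generated batch, and limit -1 means no limit:
   *
   *   FetchJob job = new FetchJob(NutchConfiguration.create());
   *   job.fetch("example_crawl", "NATIVE", Nutch.ALL_BATCH_ID_STR, 10, false, -1);
   */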
  void checkConfiguration(Configuration conf) {
    // Ensure that a value has been set for the agent name
    String agentName = conf.get("http.agent.name");
    if (agentName == null || agentName.trim().length() == 0) {
      String message = "No agents listed in 'http.agent.name' property.";
      LOG.error(message);
      throw new IllegalArgumentException(message);
    }
  }

  private void printUsage() {
    String usage = "Usage: FetchJob (<batchId> | -all) [-crawlId <id>] [-fetchMode <mode>] [-threads N]\n"
        + "\t\t[-resume] [-index] [-limit N] [-numTasks N]\n"
        + "\t\t[-solrUrl url] [-zk zkHostString] [-collection collection]\n"
        + "    <batchId>     - crawl identifier returned by Generator, or -all for all\n"
        + "\t\tgenerated batchId-s\n"
        + "    -crawlId <id> - the id to prefix the schemas to operate on\n"
        + "\t\t(default: storage.crawl.id)\n"
        + "    -fetchMode <mode> - the fetch mode, one of [native|proxy|crowdsourcing]\n"
        + "\t\t(default: fetcher.fetch.mode)\n"
        + "    -threads N    - number of fetching threads per task\n"
        + "    -resume       - resume an interrupted job\n"
        + "    -index        - index just in time\n"
        + "    -limit N      - limit the number of pages fetched per map task\n"
        + "    -numTasks N   - if N > 0 then use this many reduce tasks for fetching\n"
        + "\t\t(default: mapreduce.job.reduces)\n"
        + "    -solrUrl      - Solr server url, for example, http://localhost:8983/solr/gettingstarted\n"
        + "    -zk           - ZooKeeper host string; takes priority over solrUrl\n"
        + "    -collection   - collection name, used if -zk is specified\n";
    System.err.println(usage);
  }

  public int run(String[] args) throws Exception {
    if (args.length == 0) {
      printUsage();
      return -1;
    }

    String batchId = args[0];
    if (!batchId.equals("-all") && batchId.startsWith("-")) {
      printUsage();
      return -1;
    }

    String crawlId = null;
    String fetchMode = null;
    String solrUrl = null;
    String zkHostString = null;
    String collection = null;
    int numTasks = -1;
    int threads = 10;
    boolean resume = false;
    boolean index = false;
    int limit = -1;

    for (int i = 1; i < args.length; i++) {
      if ("-crawlId".equals(args[i])) {
        crawlId = args[++i];
      } else if ("-fetchMode".equals(args[i])) {
        fetchMode = args[++i].toUpperCase();
      } else if ("-threads".equals(args[i])) {
        threads = Integer.parseInt(args[++i]);
      } else if ("-resume".equals(args[i])) {
        resume = true;
      } else if ("-numTasks".equals(args[i])) {
        numTasks = Integer.parseInt(args[++i]);
      } else if ("-limit".equals(args[i])) {
        limit = Integer.parseInt(args[++i]);
      } else if ("-index".equals(args[i])) {
        index = true;
      } else if ("-solrUrl".equals(args[i])) {
        solrUrl = args[++i];
      } else if ("-zk".equals(args[i])) {
        zkHostString = args[++i];
      } else if ("-collection".equals(args[i])) {
        collection = args[++i];
      } else {
        throw new IllegalArgumentException("arg " + args[i] + " not recognized");
      }
    }

    return fetch(crawlId, fetchMode, batchId, threads, resume, limit, numTasks,
        index, solrUrl, zkHostString, collection);
  }
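  /*
   * Illustrative command lines (batch and crawl ids assumed), matching the
   * usage string and the option parser above:
   *
   *   FetchJob -all -crawlId example_crawl -threads 10 -resume
   *   FetchJob 1467345600000-1024 -fetchMode native -numTasks 4 -index \
   *       -solrUrl http://localhost:8983/solr/gettingstarted
   */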
  public static void main(String[] args) throws Exception {
    LOG.info("---------------------------------------------------\n\n");

    Configuration conf = NutchConfiguration.create();

    // NutchMaster should run on the master instance
    NutchMaster.startAsDaemon(conf);

    int res = ToolRunner.run(conf, new FetchJob(), args);
    System.exit(res);
  }
}
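A note on getBatchIdFilter(batchId): it is inherited from NutchJob and not shown in this file. As a rough, hypothetical sketch of what such a filter could look like using the org.apache.gora.filter.MapFieldValueFilter API already imported above (the "_gnmrk_" map key and the FilterOp choice are assumptions, not taken from this file):

    import org.apache.avro.util.Utf8;
    import org.apache.gora.filter.FilterOp;
    import org.apache.gora.filter.MapFieldValueFilter;
    import org.apache.nutch.storage.WebPage;

    // Hypothetical sketch: match rows whose generate mark in the MARKERS map
    // equals the given batch id; a null or "all" batch id means "no filter".
    protected MapFieldValueFilter<String, WebPage> getBatchIdFilter(String batchId) {
      if (batchId == null || batchId.equals(ALL_BATCH_ID_STR)) {
        return null; // scan every generated batch
      }
      MapFieldValueFilter<String, WebPage> filter = new MapFieldValueFilter<>();
      filter.setFieldName(WebPage.Field.MARKERS.toString());
      filter.setFilterOp(FilterOp.EQUALS);
      filter.setFilterIfMissing(true);        // skip rows without a generate mark
      filter.setMapKey(new Utf8("_gnmrk_"));  // assumed generate-mark key
      filter.getOperands().add(new Utf8(batchId));
      return filter;
    }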