Java tutorial: EC2ParserTask (org.commoncrawl.mapred.ec2.parser)

EC2ParserTask is the driver class that spawns the EMR job that parses CommonCrawl CrawlLogs. The full source follows.
/**
 * Copyright 2012 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package org.commoncrawl.mapred.ec2.parser;

import java.io.IOException;
import java.net.URI;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.TreeSet;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.Semaphore;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;

import org.commoncrawl.protocol.ParseOutput;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.JobBuilder;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterators;

/**
 * EC2 ParserTask
 *
 * Spawns the EMR Job that processes CrawlLogs.
 *
 * First in a sequence of jobs that are part of the migration of data
 * processing from the internal cluster to EC2. This job is designed to run on
 * EMR. It utilizes spot instances to help reduce costs, and thus currently
 * uses only Mappers with zero (0) Reducers, to make the job more resilient to
 * machine failures as well as to dynamic (spot) task tracker availability.
 *
 * @author rana
 */
@SuppressWarnings("static-access")
public class EC2ParserTask extends EC2TaskDataAwareTask {

  public static final Log LOG = LogFactory.getLog(EC2ParserTask.class);

  static final int LOGS_PER_ITERATION = 1000;
  static final Pattern CRAWL_LOG_REG_EXP = Pattern.compile("CrawlLog_ccc[0-9]{2}-[0-9]{2}_([0-9]*)");
  static final int MAX_SIMULTANEOUS_JOBS = 100;

  LinkedBlockingQueue<QueueItem> _queue = new LinkedBlockingQueue<QueueItem>();
  Semaphore jobThreadSemaphore = null;
  int maxSimultaneousJobs = MAX_SIMULTANEOUS_JOBS;

  static Options options = new Options();
  static {
    options.addOption(OptionBuilder.withArgName("testMode").hasArg(false).withDescription("Test Mode")
        .create("testMode"));
    options.addOption(OptionBuilder.withArgName("checkpoint").hasArg(false).withDescription("Create Checkpoint")
        .create("checkpoint"));
  }

  public EC2ParserTask(Configuration conf) throws Exception {
    super(conf);

    if (!conf.getBoolean(CONF_PARAM_TEST_MODE, false)) {
      conf.set(VALID_SEGMENTS_PATH_PROPERTY, VALID_SEGMENTS_PATH);
      conf.set(SEGMENT_PATH_PROPERTY, SEGMENTS_PATH);
      conf.set(JOB_LOGS_PATH_PROPERTY, JOB_LOGS_PATH);
      conf.set(CHECKPOIINTS_PATH_PROPERTY, CHECKPOINTS_PATH);
      // start negative so run() blocks until every job thread releases
      jobThreadSemaphore = new Semaphore(-(MAX_SIMULTANEOUS_JOBS - 1));
    } else {
      conf.set(VALID_SEGMENTS_PATH_PROPERTY, TEST_VALID_SEGMENTS_PATH);
      conf.set(SEGMENT_PATH_PROPERTY, TEST_SEGMENTS_PATH);
      conf.set(JOB_LOGS_PATH_PROPERTY, TEST_JOB_LOGS_PATH);
      jobThreadSemaphore = new Semaphore(0);
      maxSimultaneousJobs = 1;
    }

    FileSystem fs = FileSystem.get(new URI("s3n://aws-publicdatasets"), conf);
    LOG.info("FileSystem is:" + fs.getUri() + " Scanning for candidates at path:" + CRAWL_LOG_INTERMEDIATE_PATH);
    TreeSet<Path> candidateSet = buildCandidateList(fs, new Path(CRAWL_LOG_INTERMEDIATE_PATH));

    LOG.info("Scanning for completed segments");
    List<Path> processedLogs = scanForCompletedSegments(fs, conf);
    LOG.info("Found " + processedLogs.size() + " processed logs");
    // remove already-processed logs from the candidate set ...
    candidateSet.removeAll(processedLogs);

    // ok, we are ready to go ..
    LOG.info("There are: " + candidateSet.size() + " logs in need of parsing");
    while (candidateSet.size() != 0) {
      // take at most LOGS_PER_ITERATION logs per queued job
      ImmutableList.Builder<Path> pathBuilder = new ImmutableList.Builder<Path>();
      Iterator<Path> iterator = Iterators.limit(candidateSet.iterator(), LOGS_PER_ITERATION);
      while (iterator.hasNext()) {
        pathBuilder.add(iterator.next());
        iterator.remove();
      }
      LOG.info("Queueing Parse");
      queue(fs, conf, pathBuilder.build());
      LOG.info("Queued Parse");

      // in test mode, queue only a single segment's worth of data
      if (conf.getBoolean(CONF_PARAM_TEST_MODE, false)) {
        LOG.info("Test Mode - Queueing only a single Item");
        break;
      }
    }

    // queue shutdown items
    for (int i = 0; i < maxSimultaneousJobs; ++i) {
      _queue.put(new QueueItem());
    }
  }

  void run() {
    LOG.info("Starting Threads");
    // start up worker threads ..
    for (int i = 0; i < maxSimultaneousJobs; ++i) {
      Thread thread = new Thread(new QueueTask());
      thread.start();
    }
    // ok, wait for them to die
    LOG.info("Waiting for Queue Threads to Die");
    jobThreadSemaphore.acquireUninterruptibly();
    LOG.info("Queue Threads Dead. Exiting");
  }
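  // Shutdown accounting, for the pattern above: the semaphore starts at
  // -(MAX_SIMULTANEOUS_JOBS - 1) in production mode (or at 0 in test mode,
  // where maxSimultaneousJobs is 1), so acquireUninterruptibly() in run()
  // cannot return until all maxSimultaneousJobs worker threads have called
  // release(). Each worker releases, and exits, once it dequeues one of the
  // no-arg QueueItem sentinels the constructor appended after the real work.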
Exiting"); } static class QueueItem { QueueItem() { pathList = null; } QueueItem(FileSystem fs, Configuration conf, ImmutableList<Path> pathList) { this.conf = conf; this.fs = fs; this.pathList = pathList; } public Configuration conf; public FileSystem fs; public ImmutableList<Path> pathList; } private void queue(FileSystem fs, Configuration conf, ImmutableList<Path> paths) { try { _queue.put(new QueueItem(fs, conf, paths)); } catch (InterruptedException e) { // TODO Auto-generated catch block e.printStackTrace(); } } class QueueTask implements Runnable { @Override public void run() { while (true) { LOG.info("Queue Thread:" + Thread.currentThread().getId() + " Running"); try { QueueItem item = _queue.take(); if (item.pathList != null) { LOG.info("Queue Thread:" + Thread.currentThread().getId() + " got item with Paths:" + item.pathList); LOG.info("Queue Thread:" + Thread.currentThread().getId() + " Starting Job"); try { parse(item.fs, item.conf, item.pathList); } catch (IOException e) { LOG.error("Queue Thread:" + Thread.currentThread().getId() + " threw exception:" + CCStringUtils.stringifyException(e)); } } else { LOG.info("Queue Thread:" + Thread.currentThread().getId() + " Got Shutdown Queue Item - EXITING"); break; } } catch (InterruptedException e) { } } LOG.info("Queue Thread:" + Thread.currentThread().getId() + " Released Semaphore"); jobThreadSemaphore.release(); } } public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); conf.addResource(new Path("/home/hadoop/conf/core-site.xml")); conf.addResource(new Path("/home/hadoop/conf/mapred-site.xml")); CommandLineParser parser = new GnuParser(); try { // parse the command line arguments CommandLine line = parser.parse(options, args); boolean testMode = line.hasOption("testMode"); if (testMode) { LOG.info("Running in Test Mode"); conf.setBoolean(CONF_PARAM_TEST_MODE, true); } else { LOG.info("Running in Prod Mode"); } EC2ParserTask task = new EC2ParserTask(conf); task.run(); task.shutdown(); System.exit(0); } catch (Exception e) { LOG.error(CCStringUtils.stringifyException(e)); } System.exit(1); } private static void parse(FileSystem fs, Configuration conf, ImmutableList<Path> paths) throws IOException { LOG.info("Need to Parse:" + paths.toString()); // create output path long segmentId = System.currentTimeMillis(); String segmentPathPrefix = conf.get(SEGMENT_PATH_PROPERTY); Path outputPath = new Path(S3N_BUCKET_PREFIX + segmentPathPrefix + Long.toString(segmentId)); LOG.info("Starting Map-Reduce Job. SegmentId:" + segmentId + " OutputPath:" + outputPath); // run job... JobConf jobConf = new JobBuilder("parse job", conf) .inputs(paths).inputFormat(SequenceFileInputFormat.class).keyValue(Text.class, ParseOutput.class) .mapRunner(ParserMapRunner.class).mapper(ParserMapper.class) // allow two attempts to process the split // after that, we will pick it up in post processing step .maxMapAttempts(2).maxMapTaskFailures(1000).speculativeExecution(true).numReducers(0) .outputFormat(ParserOutputFormat.class).output(outputPath).minSplitSize(134217728 * 4) .reuseJVM(1000).build(); Path jobLogsPath = new Path( S3N_BUCKET_PREFIX + conf.get(JOB_LOGS_PATH_PROPERTY) + Long.toString(segmentId)); jobConf.set("hadoop.job.history.user.location", jobLogsPath.toString()); jobConf.set("fs.default.name", S3N_BUCKET_PREFIX); jobConf.setLong("cc.segmet.id", segmentId); // set task timeout to 20 minutes jobConf.setInt("mapred.task.timeout", 20 * 60 * 1000); // set mapper runtime to max 45 minutes ..... 
    jobConf.setLong(ParserMapper.MAX_MAPPER_RUNTIME_PROPERTY, 45 * 60 * 1000);
    jobConf.setOutputCommitter(OutputCommitter.class);
    // allow lots of failures per tracker per job
    jobConf.setMaxTaskFailuresPerTracker(Integer.MAX_VALUE);

    initializeTaskDataAwareJob(jobConf, segmentId);

    JobClient.runJob(jobConf);

    LOG.info("Job Finished. Writing Segments Manifest Files");
    writeSegmentManifestFile(fs, conf, segmentId, paths);

    String validSegmentPathPrefix = conf.get(VALID_SEGMENTS_PATH_PROPERTY);
    Path manifestOutputPath = new Path(validSegmentPathPrefix + Long.toString(segmentId));
    fs.mkdirs(manifestOutputPath);

    finalizeJob(fs, conf, jobConf, manifestOutputPath, segmentId);
  }

  private static List<Path> scanForCompletedSegments(FileSystem fs, Configuration conf) throws IOException {
    ImmutableList.Builder<Path> pathListBuilder = new ImmutableList.Builder<Path>();
    String validSegmentPathPrefix = conf.get(VALID_SEGMENTS_PATH_PROPERTY);
    for (FileStatus fileStatus : fs.globStatus(new Path(validSegmentPathPrefix + "[0-9]*"))) {
      pathListBuilder.addAll(scanSegmentManifestFile(fs, fileStatus.getPath()));
    }
    return pathListBuilder.build();
  }

  private static List<Path> scanSegmentManifestFile(FileSystem fs, Path segmentPath) throws IOException {
    LOG.info("Scanning Segment Manifest for segment at path:" + segmentPath);
    Path manifestPath = new Path(segmentPath, SEGMENT_MANIFEST_FILE);
    ImmutableList.Builder<Path> pathListBuilder = new ImmutableList.Builder<Path>();
    for (String pathStr : textFileToList(fs, manifestPath)) {
      pathListBuilder.add(new Path(pathStr));
    }
    return pathListBuilder.build();
  }

  private static void writeSegmentManifestFile(FileSystem fs, Configuration conf, long segmentTimestamp,
      List<Path> logsInSegment) throws IOException {
    LOG.info("Writing Segment Manifest for Segment: " + segmentTimestamp + " itemCount:" + logsInSegment.size());
    ImmutableList.Builder<String> listBuilder = new ImmutableList.Builder<String>();
    String validSegmentPathPrefix = conf.get(VALID_SEGMENTS_PATH_PROPERTY);
    for (Path logPath : logsInSegment) {
      listBuilder.add(logPath.toString().substring(S3N_BUCKET_PREFIX.length()));
    }
    listToTextFile(listBuilder.build(), fs,
        new Path(validSegmentPathPrefix + Long.toString(segmentTimestamp) + "/" + SEGMENT_MANIFEST_FILE));
  }

  /**
   * Build a list of parse candidates sorted by timestamp.
   *
   * @param fs
   * @param logFilePath
   * @return a Set of candidates
   * @throws IOException
   */
  private static TreeSet<Path> buildCandidateList(FileSystem fs, Path logFilePath) throws IOException {
    // order candidates by the timestamp embedded in the CrawlLog file name
    TreeSet<Path> candidateList = new TreeSet<Path>(new Comparator<Path>() {
      @Override
      public int compare(Path p1, Path p2) {
        String n1 = p1.getName();
        String n2 = p2.getName();
        Matcher m1 = CRAWL_LOG_REG_EXP.matcher(n1);
        Matcher m2 = CRAWL_LOG_REG_EXP.matcher(n2);
        m1.matches();
        m2.matches();
        Long v1 = Long.parseLong(m1.group(1));
        Long v2 = Long.parseLong(m2.group(1));
        return v1.compareTo(v2);
      }
    });

    LOG.info("Scanning for Log Files at:" + logFilePath);
    FileStatus[] candidateItems = fs.globStatus(new Path(logFilePath, "CrawlLog*"));
    for (FileStatus candidate : candidateItems) {
      candidateList.add(candidate.getPath());
    }
    return candidateList;
  }

  static void printUsage() {
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp("EC2Launcher", options);
  }
}
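A note on the fluent chain in parse(): JobBuilder is a CommonCrawl convenience wrapper, not part of Hadoop itself. As a rough, hypothetical sketch of what that chain configures (assuming JobBuilder simply forwards to the standard org.apache.hadoop.mapred.JobConf setters; the maxMapTaskFailures knob is omitted because its mapping is not shown in this file), the equivalent hand-written setup would look something like this:

  // Hypothetical approximation of the JobBuilder chain in parse(), using only
  // the stock org.apache.hadoop.mapred API. Not the actual JobBuilder code.
  static JobConf sketchParseJobConf(Configuration conf, ImmutableList<Path> paths, Path outputPath) {
    JobConf job = new JobConf(conf);
    job.setJobName("parse job");
    // .inputs(paths) / .inputFormat(SequenceFileInputFormat.class)
    org.apache.hadoop.mapred.FileInputFormat.setInputPaths(job, paths.toArray(new Path[paths.size()]));
    job.setInputFormat(SequenceFileInputFormat.class);
    // .keyValue(Text.class, ParseOutput.class)
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(ParseOutput.class);
    // .mapRunner(ParserMapRunner.class) / .mapper(ParserMapper.class)
    job.setMapRunnerClass(ParserMapRunner.class);
    job.setMapperClass(ParserMapper.class);
    // .maxMapAttempts(2) / .speculativeExecution(true) / .numReducers(0)
    job.setMaxMapAttempts(2);
    job.setMapSpeculativeExecution(true);
    job.setNumReduceTasks(0); // map-only, to stay resilient on spot instances
    // .outputFormat(ParserOutputFormat.class) / .output(outputPath)
    job.setOutputFormat(ParserOutputFormat.class);
    org.apache.hadoop.mapred.FileOutputFormat.setOutputPath(job, outputPath);
    // .minSplitSize(134217728 * 4) -> 512 MB
    job.setLong("mapred.min.split.size", 134217728L * 4);
    // .reuseJVM(1000)
    job.setNumTasksToExecutePerJvm(1000);
    return job;
  }

In the real code, parse() then layers the per-segment settings on top of the returned JobConf (job history location, task timeout, mapper runtime cap, and so on) before handing it to JobClient.runJob().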