Java tutorial
/**
 * Copyright 2011 Yusuke Matsubara
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.wikimedia.wikihadoop;

import java.io.*;
import java.util.*;
import java.util.regex.*;

import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.Seekable;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.compress.*;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.net.NetworkTopology;

/**
 * An InputFormat implementation that splits a Wikimedia dump file into page fragments and emits them as input records.
 * The record reader embedded in this input format converts each page into a sequence of page-like elements, each of which
 * contains two consecutive revisions. Output is given as keys with empty values.
 *
 * For example, given the following input containing two pages and four revisions,
 * <pre><code>
 *   <page>
 *     <title>ABC</title>
 *     <id>123</id>
 *     <revision>
 *       <id>100</id>
 *       ....
 *     </revision>
 *     <revision>
 *       <id>200</id>
 *       ....
 *     </revision>
 *     <revision>
 *       <id>300</id>
 *       ....
 *     </revision>
 *   </page>
 *   <page>
 *     <title>DEF</title>
 *     <id>456</id>
 *     <revision>
 *       <id>400</id>
 *       ....
 *     </revision>
 *   </page>
 * </code></pre>
 * it will produce four keys like this:
 * <pre><code>
 *   <page>
 *     <title>ABC</title>
 *     <id>123</id>
 *     <revision><revision beginningofpage="true"><text xml:space="preserve"></text></revision><revision>
 *       <id>100</id>
 *       ....
 *     </revision>
 *   </page>
 * </code></pre>
 * <pre><code>
 *   <page>
 *     <title>ABC</title>
 *     <id>123</id>
 *     <revision>
 *       <id>100</id>
 *       ....
 *     </revision>
 *     <revision>
 *       <id>200</id>
 *       ....
 *     </revision>
 *   </page>
 * </code></pre>
 * <pre><code>
 *   <page>
 *     <title>ABC</title>
 *     <id>123</id>
 *     <revision>
 *       <id>200</id>
 *       ....
 *     </revision>
 *     <revision>
 *       <id>300</id>
 *       ....
 *     </revision>
 *   </page>
 * </code></pre>
 * <pre><code>
 *   <page>
 *     <title>DEF</title>
 *     <id>456</id>
 *     <revision><revision beginningofpage="true"><text xml:space="preserve"></text></revision><revision>
 *       <id>400</id>
 *       ....
 *     </revision>
 *   </page>
 * </code></pre>
 */
public class StreamWikiDumpInputFormat extends KeyValueTextInputFormat {

  private static final String KEY_EXCLUDE_PAGE_PATTERN = "org.wikimedia.wikihadoop.excludePagesWith";
  private static final String KEY_PREVIOUS_REVISION    = "org.wikimedia.wikihadoop.previousRevision";
  private static final String KEY_SKIP_FACTOR          = "org.wikimedia.wikihadoop.skipFactor";

  private CompressionCodecFactory compressionCodecs = null;

  public void configure(JobConf conf) {
    this.compressionCodecs = new CompressionCodecFactory(conf);
  }

  protected boolean isSplitable(FileSystem fs, Path file) {
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    if (null == codec) {
      return true;
    }
    return codec instanceof SplittableCompressionCodec;
  }

  /**
   * Generate the list of files and make them into FileSplits.
   * @param job the job context
   * @throws IOException
   */
  @Override
  public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    LOG.info("StreamWikiDumpInputFormat.getSplits job=" + job + " n=" + numSplits);
    InputSplit[] oldSplits = super.getSplits(job, numSplits);
    List<InputSplit> splits = new ArrayList<InputSplit>();
    FileStatus[] files = listStatus(job);
    // Save the number of input files for metrics/loadgen
    job.setLong(NUM_INPUT_FILES, files.length);

    // compute total size
    long totalSize = 0;
    for (FileStatus file : files) { // check we have valid files
      if (file.isDirectory()) {
        throw new IOException("Not a file: " + file.getPath());
      }
      totalSize += file.getLen();
    }

    long minSize  = job.getLong(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MINSIZE, 1);
    long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);

    for (FileStatus file : files) {
      if (file.isDirectory()) {
        throw new IOException("Not a file: " + file.getPath());
      }
      long blockSize = file.getBlockSize();
      long splitSize = computeSplitSize(goalSize, minSize, blockSize);
      LOG.info(String.format("goalsize=%d splitsize=%d blocksize=%d", goalSize, splitSize, blockSize));
      //System.err.println(String.format("goalsize=%d splitsize=%d blocksize=%d", goalSize, splitSize, blockSize));
      for (InputSplit x : getSplits(job, file, pageBeginPattern, splitSize))
        splits.add(x);
    }
    System.err.println("splits=" + splits);
    return splits.toArray(new InputSplit[splits.size()]);
  }

  private FileSplit makeSplit(Path path, long start, long size, NetworkTopology clusterMap, BlockLocation[] blkLocations) throws IOException {
    return makeSplit(path, start, size, getSplitHosts(blkLocations, start, size, clusterMap));
  }

  public List<InputSplit> getSplits(JobConf job, FileStatus file, String pattern, long splitSize) throws IOException {
    NetworkTopology clusterMap = new NetworkTopology();
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Path path = file.getPath();
    long length = file.getLen();
    FileSystem fs = file.getPath().getFileSystem(job);
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);

    if ((length != 0) && isSplitable(fs, path)) {
      long bytesRemaining = length;
      SeekableInputStream in = SeekableInputStream.getInstance(path, 0, length, fs, this.compressionCodecs);
      SplitCompressionInputStream is = in.getSplitCompressionInputStream();
      long start = 0;
      long skip = 0;
      if (is != null) {
        start = is.getAdjustedStart();
        length = is.getAdjustedEnd();
        is.close();
        in = null;
      }
      LOG.info("locations=" + Arrays.asList(blkLocations));
      FileSplit split = null;
      Set<Long> processedPageEnds = new HashSet<Long>();
      float factor = job.getFloat(KEY_SKIP_FACTOR, 1.2F);

      READLOOP:
      while (((double) bytesRemaining) / splitSize
             > factor && bytesRemaining > 0) {
        // prepare matcher
        ByteMatcher matcher;
        {
          long st = Math.min(start + skip + splitSize, length - 1);
          split = makeSplit(path, st, Math.min(splitSize, length - st), clusterMap, blkLocations);
          System.err.println("split move to: " + split);
          if (in != null) in.close();
          if (split.getLength() <= 1) {
            break;
          }
          in = SeekableInputStream.getInstance(split, fs, this.compressionCodecs);
          SplitCompressionInputStream cin = in.getSplitCompressionInputStream();
        }
        matcher = new ByteMatcher(in);

        // read until the next page end in the look-ahead split
        boolean reach = false;
        while (!matcher.readUntilMatch(pageEndPattern, null, split.getStart() + split.getLength())) {
          if (matcher.getPos() >= length || split.getLength() == length - split.getStart())
            break READLOOP;
          reach = false;
          split = makeSplit(path, split.getStart(),
                            Math.min(split.getLength() + splitSize, length - split.getStart()),
                            clusterMap, blkLocations);
          System.err.println("split extend to: " + split);
        }
        System.err.println(path + ": #" + splits.size() + " " + pageEndPattern
            + " found: pos=" + matcher.getPos() + " last=" + matcher.getLastUnmatchPos()
            + " read=" + matcher.getReadBytes() + " current=" + start
            + " remaining=" + bytesRemaining + " split=" + split);
        if (matcher.getLastUnmatchPos() > 0
            && matcher.getPos() > matcher.getLastUnmatchPos()
            && !processedPageEnds.contains(matcher.getPos())) {
          splits.add(makeSplit(path, start, matcher.getPos() - start, clusterMap, blkLocations));
          processedPageEnds.add(matcher.getPos());
          long newstart = Math.max(matcher.getLastUnmatchPos(), start);
          bytesRemaining = length - newstart;
          start = newstart;
          skip = 0;
        } else {
          skip = matcher.getPos() - start;
        }
      }

      if (bytesRemaining > 0 && !processedPageEnds.contains(length)) {
        System.err.println(pageEndPattern + " remaining: pos=" + (length - bytesRemaining) + " end=" + length);
        splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
                             blkLocations[blkLocations.length - 1].getHosts()));
      }
      if (in != null) in.close();
    } else if (length != 0) {
      splits.add(makeSplit(path, 0, length, clusterMap, blkLocations));
    } else {
      // Create empty hosts array for zero length files
      splits.add(makeSplit(path, 0, length, new String[0]));
    }
    return splits;
  }

  public RecordReader<Text, Text> getRecordReader(final InputSplit genericSplit, JobConf job, Reporter reporter) throws IOException {
    // handling non-standard record reader (likely StreamXmlRecordReader)
    FileSplit split = (FileSplit) genericSplit;
    LOG.info("getRecordReader start.....split=" + split);
    reporter.setStatus(split.toString());
    // Open the file and seek to the start of the split
    FileSystem fs = split.getPath().getFileSystem(job);
    String patt  = job.get(KEY_EXCLUDE_PAGE_PATTERN);
    boolean prev = job.getBoolean(KEY_PREVIOUS_REVISION, true);
    return new MyRecordReader(split, reporter, job, fs,
                              patt != null && !"".equals(patt) ?
                                  Pattern.compile(patt) : null,
                              prev);
  }

  private class MyRecordReader implements RecordReader<Text, Text> {

    public MyRecordReader(FileSplit split, Reporter reporter, JobConf job, FileSystem fs, Pattern exclude, boolean prev) throws IOException {
      this.revisionBeginPattern = "<revision";
      this.revisionEndPattern   = "</revision>";
      this.pageHeader   = new DataOutputBuffer();
      this.prevRevision = new DataOutputBuffer();
      this.pageFooter = getBuffer("\n</page>\n".getBytes("UTF-8"));
      this.revHeader  = getBuffer(this.revisionBeginPattern.getBytes("UTF-8"));
      this.firstDummyRevision = getBuffer(
          " beginningofpage=\"true\"><text xml:space=\"preserve\"></text></revision>\n".getBytes("UTF-8"));
      this.bufInRev     = new DataOutputBuffer();
      this.bufBeforeRev = new DataOutputBuffer();
      this.split = split;
      this.fs = fs;
      this.exclude = exclude;
      this.recordPrevRevision = prev;
      SeekableInputStream in = SeekableInputStream.getInstance(split, fs, compressionCodecs);
      SplitCompressionInputStream sin = in.getSplitCompressionInputStream();
      if (sin == null) {
        this.start = split.getStart();
        this.end   = split.getStart() + split.getLength();
      } else {
        this.start = sin.getAdjustedStart();
        this.end   = sin.getAdjustedEnd() + 1;
      }
      this.reporter = reporter;
      allWrite(this.prevRevision, this.firstDummyRevision);
      this.currentPageNum = -1;
      this.pageBytes = getPageBytes(this.split, this.fs, compressionCodecs, this.reporter);
      this.istream = SeekableInputStream.getInstance(this.split, this.fs, compressionCodecs);
      this.matcher = new ByteMatcher(this.istream, this.istream);
      this.seekNextRecordBoundary();
      this.reporter.incrCounter(WikiDumpCounters.WRITTEN_REVISIONS, 0);
      this.reporter.incrCounter(WikiDumpCounters.WRITTEN_PAGES, 0);
    }

    @Override
    public Text createKey() {
      return new Text();
    }

    @Override
    public Text createValue() {
      return new Text();
    }

    @Override
    public void close() throws IOException {
      this.istream.close();
    }

    @Override
    public float getProgress() throws IOException {
      float rate = 0.0f;
      if (this.end == this.start) {
        rate = 1.0f;
      } else {
        rate = ((float) (this.getPos() - this.start)) / ((float) (this.end - this.start));
      }
      return rate;
    }

    @Override
    public long getPos() throws IOException {
      return this.matcher.getPos();
    }

    public synchronized long getReadBytes() throws IOException {
      return this.matcher.getReadBytes();
    }

    @Override
    synchronized public boolean next(Text key, Text value) throws IOException {
      //LOG.info("StreamWikiDumpInputFormat: split=" + split + " start=" + this.start + " end=" + this.end + " pos=" + this.getPos());
      while (true) {
        if (this.nextPageBegin() < 0) {
          return false;
        }
        //System.err.println("0.2 check pos="+this.getPos() + " end="+this.end);//!
        if (this.currentPageNum >= this.pageBytes.size() / 2
            || this.getReadBytes() >= this.tailPageEnd()) {
          return false;
        }
        //System.err.println("2 move to rev from: " + this.getReadBytes());//!
        if (!readUntilMatch(this.revisionBeginPattern, this.bufBeforeRev)
            || this.getReadBytes() >= this.tailPageEnd()) { // move to the beginning of the next revision
          return false;
        }
        //System.err.println("2.1 move to rev to: " + this.getReadBytes());//!
        //System.err.println("4.5 check if exceed: " + this.getReadBytes() + " " + nextPageBegin() + " " + prevPageEnd());//!
        if (this.getReadBytes() >= this.nextPageBegin()) {
          // int off = (int)(this.nextPageBegin() - this.prevPageEnd());
          int off = findIndex(pageBeginPattern.getBytes("UTF-8"), this.bufBeforeRev);
          if (off >= 0) {
            offsetWrite(this.pageHeader, off, this.bufBeforeRev);
            allWrite(this.prevRevision, this.firstDummyRevision);
            this.currentPageNum++;
            if (this.exclude != null
                && this.exclude.matcher(new String(this.pageHeader.getData(), "UTF-8")).find()) {
              reporter.incrCounter(WikiDumpCounters.SKIPPED_PAGES, 1);
              this.seekNextRecordBoundary();
            } else {
              reporter.incrCounter(WikiDumpCounters.WRITTEN_PAGES, 1);
              break;
            }
            //System.err.println("4.6 exceed");//!
          } else {
            throw new IllegalArgumentException();
          }
        } else {
          break;
        }
      }

      //System.err.println("4 read rev from: " + this.getReadBytes());//!
      if (!readUntilMatch(this.revisionEndPattern, this.bufInRev)) { // store the revision
        //System.err.println("no revision end" + this.getReadBytes() + " " + this.end);//!
        LOG.info("no revision end");
        return false;
      }
      //System.err.println("4.1 read rev to: " + this.getReadBytes());//!
      //System.err.println("5 read rev pos " + this.getReadBytes());//!

      // assemble the emitted record: page header, then (if KEY_PREVIOUS_REVISION is enabled) the previous
      // revision or the dummy first revision, then the current revision, followed by the page footer
      byte[] record = this.recordPrevRevision
          ? writeInSequence(new DataOutputBuffer[] { this.pageHeader, this.prevRevision, this.revHeader, this.bufInRev, this.pageFooter })
          : writeInSequence(new DataOutputBuffer[] { this.pageHeader, this.bufInRev, this.pageFooter });
      key.set(record);
      //System.out.print(key.toString());//!
      value.set("");
      this.reporter.setStatus("StreamWikiDumpInputFormat: write new record pos=" + this.getPos()
          + " bytes=" + this.getReadBytes() + " next=" + this.nextPageBegin() + " prev=" + this.prevPageEnd());
      reporter.incrCounter(WikiDumpCounters.WRITTEN_REVISIONS, 1);
      if (this.recordPrevRevision) {
        allWrite(this.prevRevision, this.bufInRev);
      }
      return true;
    }

    public synchronized void seekNextRecordBoundary() throws IOException {
      if (this.getReadBytes() < this.nextPageBegin()) {
        long len = this.nextPageBegin() - this.getReadBytes();
        this.matcher.skip(len);
      }
    }

    private synchronized boolean readUntilMatch(String textPat, DataOutputBuffer outBufOrNull) throws IOException {
      if (outBufOrNull != null)
        outBufOrNull.reset();
      return this.matcher.readUntilMatch(textPat, outBufOrNull, this.end);
    }

    private long tailPageEnd() {
      if (this.pageBytes.size() > 0) {
        return this.pageBytes.get(this.pageBytes.size() - 1);
      } else {
        return 0;
      }
    }

    private long nextPageBegin() {
      if ((this.currentPageNum + 1) * 2 < this.pageBytes.size()) {
        return this.pageBytes.get((this.currentPageNum + 1) * 2);
      } else {
        return -1;
      }
    }

    private long prevPageEnd() {
      if (this.currentPageNum == 0) {
        if (this.pageBytes.size() > 0) {
          return this.pageBytes.get(0);
        } else {
          return 0;
        }
      } else if (this.currentPageNum * 2 - 1 <= this.pageBytes.size() - 1) {
        return this.pageBytes.get(this.currentPageNum * 2 - 1);
      } else {
        return this.pageBytes.get(this.pageBytes.size() - 1);
      }
    }

    private int currentPageNum;
    private final Pattern exclude;
    private final boolean recordPrevRevision;
    private final long start;
    private final long end;
    private final List<Long> pageBytes;
    private final SeekableInputStream istream;
    private final String revisionBeginPattern;
    private final String revisionEndPattern;
    private final DataOutputBuffer pageHeader;
    private final DataOutputBuffer revHeader;
    private final DataOutputBuffer prevRevision;
    private final DataOutputBuffer pageFooter;
    private final DataOutputBuffer firstDummyRevision;
    private final DataOutputBuffer bufInRev;
    private final DataOutputBuffer bufBeforeRev;
    private final FileSystem fs;
    private final FileSplit split;
    private final Reporter reporter;
    private final ByteMatcher matcher;
  }

  private static byte[] writeInSequence(DataOutputBuffer[] array) {
    int size = 0;
    for (DataOutputBuffer buf : array) {
      size += buf.getLength();
    }
    byte[] dest = new byte[size];
    int n = 0;
    for (DataOutputBuffer buf : array) {
      System.arraycopy(buf.getData(), 0, dest, n, buf.getLength());
      n += buf.getLength();
    }
    return dest;
  }

  private static DataOutputBuffer getBuffer(byte[] bytes) throws IOException {
    DataOutputBuffer ret = new DataOutputBuffer(bytes.length);
    ret.write(bytes);
    return ret;
  }

  private static List<Long> getPageBytes(FileSplit split, FileSystem fs, CompressionCodecFactory compressionCodecs, Reporter reporter) throws IOException {
    SeekableInputStream in = null;
    try {
      in = SeekableInputStream.getInstance(split, fs, compressionCodecs);
      long start = split.getStart();
      long end   = start + split.getLength();
      SplitCompressionInputStream cin = in.getSplitCompressionInputStream();
      if (cin != null) {
        start = cin.getAdjustedStart();
        end   = cin.getAdjustedEnd() + 1;
      }
      ByteMatcher matcher = new ByteMatcher(in, in);
      // ret alternates between the byte offset where each <page> begins and the offset where its </page> starts
      List<Long> ret = new ArrayList<Long>();
      while (true) {
        if (matcher.getPos() >= end || !matcher.readUntilMatch(pageBeginPattern, null, end)) {
          break;
        }
        ret.add(matcher.getReadBytes() - pageBeginPattern.getBytes("UTF-8").length);
        if (matcher.getPos() >= end || !matcher.readUntilMatch(pageEndPattern, null, end)) {
          System.err.println("could not find " + pageEndPattern + ", page over a split? pos=" + matcher.getPos()
              + " bytes=" + matcher.getReadBytes());
          //ret.add(end);
          break;
        }
        ret.add(matcher.getReadBytes() - pageEndPattern.getBytes("UTF-8").length);
        String report = String.format("StreamWikiDumpInputFormat: find page %6d start=%d pos=%d end=%d bytes=%d",
                                      ret.size(), start, matcher.getPos(), end, matcher.getReadBytes());
        reporter.setStatus(report);
        reporter.incrCounter(WikiDumpCounters.FOUND_PAGES, 1);
        LOG.info(report);
      }
      if (ret.size() % 2 == 0) {
        ret.add(matcher.getReadBytes());
      }
      //System.err.println("getPageBytes " + ret);//!
      return ret;
    } finally {
      if (in != null) {
        in.close();
      }
    }
  }

  private static void offsetWrite(DataOutputBuffer to, int fromOffset, DataOutputBuffer from) throws IOException {
    if (from.getLength() <= fromOffset || fromOffset < 0) {
      throw new IllegalArgumentException(String.format("invalid offset: offset=%d length=%d", fromOffset, from.getLength()));
    }
    byte[] bytes = new byte[from.getLength() - fromOffset];
    System.arraycopy(from.getData(), fromOffset, bytes, 0, bytes.length);
    to.reset();
    to.write(bytes);
  }

  private static void allWrite(DataOutputBuffer to, DataOutputBuffer from) throws IOException {
    offsetWrite(to, 0, from);
  }

  private static int findIndex(byte[] match, DataOutputBuffer from_) throws IOException {
    // TODO: faster string pattern match (KMP etc)
    int m = 0;
    int i;
    byte[] from = from_.getData();
    for (i = 0; i < from_.getLength(); ++i) {
      if (from[i] == match[m]) {
        ++m;
      } else {
        m = 0;
      }
      if (m == match.length) {
        return i - m + 1;
      }
    }
    // throw new IllegalArgumentException("pattern not found: " + new String(match) + " in " + new String(from));
    System.err.println("pattern not found: " + new String(match) + " in " + new String(from, 0, from_.getLength()));//!
    return -1;
  }

  private static enum WikiDumpCounters {
    FOUND_PAGES, WRITTEN_REVISIONS, WRITTEN_PAGES, SKIPPED_PAGES
  }

  private static final String pageBeginPattern = "<page>";
  private static final String pageEndPattern   = "</page>";
}
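The input format above is driven entirely by JobConf settings: it is selected with setInputFormat, and its behaviour is tuned through the three org.wikimedia.wikihadoop.* keys defined at the top of the class. As a rough illustration of how it might be wired into a job, here is a minimal driver sketch using the old mapred API. The driver class name, the identity mapper/reducer choice, and the example exclude pattern are assumptions made for this sketch and are not part of the original file; the configuration keys and their defaults come from the class itself.

package org.wikimedia.wikihadoop;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

// Hypothetical example driver: reads a Wikimedia dump and writes each
// two-revision page fragment produced by StreamWikiDumpInputFormat to text output.
public class WikiDumpDriver {
  public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(WikiDumpDriver.class);
    conf.setJobName("wikihadoop-example");

    // Each record key is a <page> fragment holding two consecutive revisions; values are empty.
    conf.setInputFormat(StreamWikiDumpInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    // Optional knobs read by StreamWikiDumpInputFormat (values shown here are illustrative;
    // previousRevision defaults to true and skipFactor to 1.2 if unset).
    conf.set("org.wikimedia.wikihadoop.excludePagesWith", "<title>Talk:"); // skip pages whose header matches this regex (example pattern)
    conf.setBoolean("org.wikimedia.wikihadoop.previousRevision", true);    // include the previous revision in each record
    conf.setFloat("org.wikimedia.wikihadoop.skipFactor", 1.2F);            // look-ahead factor used when probing for </page>

    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));
    JobClient.runJob(conf);
  }
}

With a setup along these lines, each map input key is one of the page-like fragments shown in the class javadoc, so a mapper can diff the two revisions in a fragment without ever holding a whole page history in memory.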