// Java tutorial
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.data;

import java.io.BufferedOutputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;
import java.util.ArrayList;

import org.apache.pig.impl.util.Spillable;
import org.apache.pig.backend.hadoop.executionengine.mapreduceExec.PigMapReduce;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * A collection of Tuples.  A DataBag may or may not fit into memory.
 * DataBag extends spillable, which means that it registers with a memory
 * manager.  By default, it attempts to keep all of its contents in memory.
 * If it is asked by the memory manager to spill to disk (by a call to
 * spill()), it takes whatever it has in memory, opens a spill file, and
 * writes the contents out.  This may happen multiple times.  The bag
 * tracks all of the files it's spilled to.
 *
 * DataBag provides an Iterator interface, that allows callers to read
 * through the contents.  The iterators are aware of the data spilling.
 * They have to be able to handle reading from files, as well as the fact
 * that data they were reading from memory may have been spilled to disk
 * underneath them.
 *
 * The DataBag interface assumes that all data is written before any is
 * read.  That is, a DataBag cannot be used as a queue.  If data is written
 * after data is read, the results are undefined.  This condition is not
 * checked on each add or read, for reasons of speed.  Caveat emptor.
 *
 * Since spills are asynchronous (the memory manager requesting a spill
 * runs in a separate thread), all operations dealing with the mContents
 * Collection (which is the collection of tuples contained in the bag) have
 * to be synchronized.  This means that reading from a DataBag is currently
 * serialized.  This is ok for the moment because pig execution is
 * currently single threaded.  A ReadWriteLock was experimented with, but
 * it was found to be about 10x slower than using the synchronize keyword.
 * If pig changes its execution model to be multithreaded, we may need to
 * return to this issue, as synchronizing reads will most likely defeat the
 * purpose of multi-threading execution.
 *
 * DataBag come in several types, default, sorted, and distinct.  The type
 * must be chosen up front, there is no way to convert a bag on the fly.
 */
public abstract class DataBag extends Datum implements Spillable, Iterable<Tuple> {

    private static final Log log = LogFactory.getLog(DataBag.class);

    // Container that holds the tuples. Actual object instantiated by
    // subclasses.
    protected Collection<Tuple> mContents;

    // Spill files we've created.  These need to be removed in finalize.
    protected ArrayList<File> mSpillFiles;

    // Total size, including tuples on disk.  Stored here so we don't have
    // to run through the disk when people ask.
    protected long mSize = 0;

    // Dirty flag for the cached memory-size estimate below.
    protected boolean mMemSizeChanged = false;

    // Cached result of getMemorySize(), recomputed only when the bag changes.
    protected long mMemSize = 0;

    /**
     * Get the number of elements in the bag, both in memory and on disk.
     */
    public long size() {
        return mSize;
    }

    /**
     * Deprecated.  Use size() instead.
     * @deprecated Use {@link #size()} instead; this narrows the count to int.
     */
    @Deprecated
    public int cardinality() {
        return (int) size();
    }

    /**
     * Find out if the bag is sorted.
     */
    public abstract boolean isSorted();

    /**
     * Find out if the bag is distinct.
     */
    public abstract boolean isDistinct();

    /**
     * Get an iterator to the bag.  For default and distinct bags,
     * no particular order is guaranteed.  For sorted bags the order
     * is guaranteed to be sorted according
     * to the provided comparator.
     */
    public abstract Iterator<Tuple> iterator();

    /**
     * Deprecated.  Use iterator() instead.
     * @deprecated Use {@link #iterator()} instead.
     */
    @Deprecated
    public Iterator<Tuple> content() {
        return iterator();
    }

    /**
     * Add a tuple to the bag.
     * @param t tuple to add.
     */
    public void add(Tuple t) {
        synchronized (mContents) {
            mMemSizeChanged = true;
            mSize++;
            mContents.add(t);
        }
    }

    /**
     * Add contents of a bag to the bag.
     * @param b bag to add contents of.
     */
    public void addAll(DataBag b) {
        synchronized (mContents) {
            mMemSizeChanged = true;
            mSize += b.size();
            for (Tuple t : b) {
                mContents.add(t);
            }
        }
    }

    // Do I need remove?  I couldn't find it used anywhere.

    /**
     * Return the size of memory usage.
     */
    @Override
    public long getMemorySize() {
        if (!mMemSizeChanged) return mMemSize;

        long used = 0;
        // I can't afford to walk through all the tuples every time the
        // memory manager wants to know if it's time to dump.  Just sample
        // the first 100 and see what we get.  This may not be 100%
        // accurate, but it's just an estimate anyway.
        int j;
        int numInMem = 0;
        synchronized (mContents) {
            numInMem = mContents.size();
            // Measure only what's in memory, not what's on disk.
            Iterator<Tuple> i = mContents.iterator();
            for (j = 0; i.hasNext() && j < 100; j++) {
                used += i.next().getMemorySize();
                used += REF_SIZE;
            }
        }

        if (numInMem > 100) {
            // Estimate the per tuple size.  Do it in integer arithmetic
            // (even though it will be slightly less accurate) for speed.
            used /= j;
            used *= numInMem;
        }

        mMemSize = used;
        mMemSizeChanged = false;
        return used;
    }

    /**
     * Clear out the contents of the bag, both on disk and in memory.
     * Any attempts to read after this is called will produce undefined
     * results.
     */
    public void clear() {
        synchronized (mContents) {
            mContents.clear();
            if (mSpillFiles != null) {
                for (int i = 0; i < mSpillFiles.size(); i++) {
                    mSpillFiles.get(i).delete();
                }
                mSpillFiles.clear();
            }
            mSize = 0;
        }
    }

    /**
     * This method is potentially very expensive since it may require a
     * sort of the bag; don't call it unless you have to.
     */
    public int compareTo(Object other) {
        // Do we really need to be able to compare to DataAtom and Tuple?
        // When does that happen?
        if (this == other) return 0;

        if (other instanceof DataBag) {
            DataBag bOther = (DataBag) other;
            if (this.size() != bOther.size()) {
                if (this.size() > bOther.size()) return 1;
                else return -1;
            }

            // Ugh, this is bogus.  But I have to know if two bags have the
            // same tuples, regardless of order.  Hopefully most of the
            // time the size check above will prevent this.
            // If either bag isn't already sorted, create a sorted bag out
            // of it so I can guarantee order.
            DataBag thisClone;
            DataBag otherClone;
            if (this instanceof SortedDataBag || this instanceof DistinctDataBag) {
                thisClone = this;
            } else {
                thisClone = new SortedDataBag(null);
                Iterator<Tuple> i = iterator();
                while (i.hasNext()) thisClone.add(i.next());
            }
            // BUG FIX: this used to test "this" instead of the other bag
            // (copy-paste error), so an unsorted other bag was compared in
            // arbitrary iteration order and equal bags could compare unequal.
            if (bOther instanceof SortedDataBag || bOther instanceof DistinctDataBag) {
                otherClone = bOther;
            } else {
                otherClone = new SortedDataBag(null);
                Iterator<Tuple> i = bOther.iterator();
                while (i.hasNext()) otherClone.add(i.next());
            }

            Iterator<Tuple> thisIt = thisClone.iterator();
            Iterator<Tuple> otherIt = otherClone.iterator();
            while (thisIt.hasNext() && otherIt.hasNext()) {
                Tuple thisT = thisIt.next();
                Tuple otherT = otherIt.next();

                int c = thisT.compareTo(otherT);
                if (c != 0) return c;
            }

            return 0;   // if we got this far, they must be equal
        } else if (other instanceof DataAtom) {
            return +1;
        } else if (other instanceof Tuple) {
            return -1;
        } else {
            return -1;
        }
    }

    @Override
    public boolean equals(Object other) {
        return compareTo(other) == 0;
    }

    /**
     * Write a bag's contents to disk.
     * @param out DataOutput to write data to.
     * @throws IOException (passes it on from underlying calls).
     */
    @Override
    public void write(DataOutput out) throws IOException {
        // We don't care whether this bag was sorted or distinct because
        // using the iterator to write it will guarantee those things come
        // correctly.  And on the other end there'll be no reason to waste
        // time re-sorting or re-applying distinct.
        out.write(BAG);
        out.writeLong(size());
        for (Tuple t : this) {
            t.write(out);
        }
    }

    /**
     * Read a bag from disk.
     * @param in DataInput to read data from.
     * @throws IOException (passes it on from underlying calls).
     */
    static DataBag read(DataInput in) throws IOException {
        long size = in.readLong();
        // Always use a default data bag, as if it was sorted or distinct
        // we're guaranteed it was written out that way already, and we
        // don't need to mess with it.
        DataBag ret = BagFactory.getInstance().newDefaultBag();

        for (long i = 0; i < size; i++) {
            Tuple t = new Tuple();
            t.readFields(in);
            ret.add(t);
        }

        return ret;
    }

    /**
     * This is used by FuncEvalSpec.FakeDataBag.
     * @param stale Set stale state.
     */
    public void markStale(boolean stale) {
    }

    /**
     * Write the bag into a string.
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append('{');
        Iterator<Tuple> it = iterator();
        while (it.hasNext()) {
            Tuple t = it.next();
            String s = t.toString();
            sb.append(s);
            if (it.hasNext()) sb.append(", ");
        }
        sb.append('}');
        return sb.toString();
    }

    @Override
    public int hashCode() {
        int hash = 1;
        Iterator<Tuple> i = iterator();
        while (i.hasNext()) {
            // Use 37 because we want a prime, and tuple uses 31.
            hash = 37 * hash + i.next().hashCode();
        }
        return hash;
    }

    /**
     * Need to override finalize to clean out the mSpillFiles array.
     */
    @Override
    protected void finalize() {
        if (mSpillFiles != null) {
            for (int i = 0; i < mSpillFiles.size(); i++) {
                mSpillFiles.get(i).delete();
            }
        }
    }

    /**
     * Get a file to spill contents to.  The file will be registered in the
     * mSpillFiles array.
     * @return stream to write tuples to.
     */
    protected DataOutputStream getSpillFile() throws IOException {
        if (mSpillFiles == null) {
            // We want to keep the list as small as possible.
            mSpillFiles = new ArrayList<File>(1);
        }

        String tmpDirName = System.getProperties().getProperty("java.io.tmpdir");
        File tmpDir = new File(tmpDirName);

        // If the directory does not exist, create it.
        if (!tmpDir.exists()) {
            log.info("Temporary directory doesn't exist. Trying to create: "
                    + tmpDir.getAbsolutePath());
            // Create the directory and see if it was successful.
            // BUG FIX: use mkdirs() rather than mkdir() so that any missing
            // parent directories are created as well.
            if (tmpDir.mkdirs()) {
                log.info("Successfully created temporary directory: "
                        + tmpDir.getAbsolutePath());
            } else {
                // If execution reaches here, it means that we needed to create
                // the directory but were not successful in doing so.
                //
                // If this directory was created recently then we can simply
                // skip creation.  This is to address a rare issue occurring
                // in a cluster despite the fact that spill() makes the call
                // to getSpillFile() in a synchronized block.
                if (tmpDir.exists()) {
                    log.info("Temporary directory already exists: "
                            + tmpDir.getAbsolutePath());
                } else {
                    log.error("Unable to create temporary directory: "
                            + tmpDir.getAbsolutePath());
                    throw new IOException("Unable to create temporary directory: "
                            + tmpDir.getAbsolutePath());
                }
            }
        }

        File f = File.createTempFile("pigbag", null);
        f.deleteOnExit();
        mSpillFiles.add(f);
        return new DataOutputStream(new BufferedOutputStream(new FileOutputStream(f)));
    }

    /**
     * Report progress to HDFS.
     */
    protected void reportProgress() {
        if (PigMapReduce.reporter != null) {
            PigMapReduce.reporter.progress();
        }
    }

    // Marker tuples used to delimit bags in a serialized tuple stream.
    public static abstract class BagDelimiterTuple extends Tuple {
    }

    public static class StartBag extends BagDelimiterTuple {
    }

    public static class EndBag extends BagDelimiterTuple {
    }

    public static final Tuple startBag = new StartBag();
    public static final Tuple endBag = new EndBag();

    protected static final int MAX_SPILL_FILES = 100;
}