/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.raid;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.*;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ChecksumException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.BlockMissingException;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.DistributedRaidFileSystem;
import org.apache.hadoop.hdfs.RaidDFSUtil;
import org.apache.hadoop.hdfs.protocol.*;

// PMDecoder is not intended to be accessed by multiple threads.
public class PMDecoder extends Decoder {

  static {
    // native libraries for PM decoding and failure-pattern validation
    System.loadLibrary("pmdecoder");
    System.loadLibrary("pmvalidate");
  }

  native void pmValidate(int k, int m, int n, int[] locations, int[] validLocations);

  // number of decode threads
  private int threadNum = 1;
  // data queues: encoded packets waiting to be decoded
  private BlockingQueue[] q;
  // signal queues: decode threads report completion of the last packet
  private BlockingQueue[] p;
  private static final Log LOG =
      LogFactory.getLog("org.apache.hadoop.raid.PMDecoder");
  // number of bytes read from each surviving stream per packet
  private int encodedBufSize;

  /**
   * @param forRecovery selects the decoder type: true for block recovery,
   *                    false for degraded read (the two paths may eventually be merged)
   */
  public PMDecoder(Configuration conf, int stripeSize, int paritySize,
                   boolean forRecovery) {
    super(conf, stripeSize, paritySize);
    LOG.info("initial decoder: k=" + stripeSize + " m=" + paritySize +
        " bufSize:" + bufSize);
    threadNum = conf.getInt("hdfs.raid.decoder.threadnum", 1);

    // data queues: input to decode
    this.q = new BlockingQueue[threadNum];
    for (int i = 0; i < threadNum; i++)
      q[i] = new ArrayBlockingQueue<DecodePackage>(2048 / paritySize);

    // signal queues: decode to output
    this.p = new BlockingQueue[threadNum];
    for (int i = 0; i < threadNum; i++)
      p[i] = new ArrayBlockingQueue<Integer>(1024);

    Thread[] ds = new Thread[threadNum];
    for (int i = 0; i < threadNum; i++) {
      if (forRecovery) {
        PMRecoveryDecoder decoder = new PMRecoveryDecoder(i);
        ds[i] = new Thread(decoder);
      } else {
        PMDegradedReadDecoder decoder = new PMDegradedReadDecoder(i);
        ds[i] = new Thread(decoder);
      }
      ds[i].start();
    }
    LOG.info("PMDecoder 1/1");
  }

  /**
   * Fix the erased blocks of a stripe (recovery path).
   */
  protected void fixErasedBlock(FileSystem fs, Path srcFile, FileSystem parityFs,
      Path parityFile, long blockSize, Map<Integer, LocatedBlock> corruptStripe,
      File[] lbfs, int stripeIdx) throws IOException {
    FSDataInputStream[] inputs = new FSDataInputStream[stripeSize + paritySize];
    Set<Integer> tmploc = corruptStripe.keySet();
    int[] erasedLocations = new int[tmploc.size()];
    int idx = 0;
    for (Integer loc : tmploc) {
      erasedLocations[idx++] = loc;
    }
    Arrays.sort(erasedLocations);

    // validate the failure pattern; temp[0] holds the count of valid erased locations
    int[] temp = new int[stripeSize + paritySize + 1];
    pmValidate(stripeSize, paritySize, erasedLocations.length, erasedLocations, temp);
    int[] validErasedLocations = new int[temp[0]];
    System.arraycopy(temp, 1, validErasedLocations, 0, temp[0]);
    Arrays.sort(validErasedLocations);
    if (erasedLocations.length != validErasedLocations.length)
      LOG.info("failure pattern is invalid");

    encodedBufSize = bufSize * validErasedLocations.length / paritySize;
    PMStreamFactory sf = new PMStreamFactory(fs, srcFile, parityFs, parityFile, stripeIdx);
    sf.buildStream(inputs, validErasedLocations);

    long srcFileSize = fs.getFileStatus(srcFile).getLen();
    long parityFileSize = parityFs.getFileStatus(parityFile).getLen();
    long[] limits = new long[erasedLocations.length];
    for (int i = 0; i < limits.length; i++) {
      long remaining = 0;
      if (erasedLocations[i] < stripeSize)
        remaining = srcFileSize - corruptStripe.get(erasedLocations[i]).getStartOffset();
      else
        remaining = parityFileSize - corruptStripe.get(erasedLocations[i]).getStartOffset();
      limits[i] = Math.min(blockSize, remaining);
    }
    writeFixedBlock(inputs, erasedLocations, validErasedLocations, corruptStripe,
        lbfs, limits, sf);
  }

  /**
   * PMStreamFactory generates the FSDataInputStreams for a stripe.
   * The streams must be rebuilt whenever validErasedLocations changes.
   */
  class PMStreamFactory {
    private FileSystem fs = null;
    private Path srcFile = null;
    private FileSystem parityFs = null;
    private Path parityFile = null;
    private long stripeIdx = -1;

    public PMStreamFactory(FileSystem fs, Path srcFile, FileSystem parityFs,
        Path parityFile, long stripeIdx) {
      this.fs = fs;
      this.parityFs = parityFs;
      this.srcFile = srcFile;
      this.parityFile = parityFile;
      this.stripeIdx = stripeIdx;
    }

    public void buildStream(FSDataInputStream[] inputs, int[] validErasedLocations)
        throws IOException {
      buildStream(inputs, validErasedLocations, 0);
    }

    public void buildStream(FSDataInputStream[] inputs, int[] validErasedLocations,
        int skipInBlock) throws IOException {
      FileStatus srcStat = fs.getFileStatus(srcFile);
      FileStatus parityStat = parityFs.getFileStatus(parityFile);
      long blockSize = srcStat.getBlockSize();
      for (int i = 0, j = 0; i < inputs.length; i++) {
        long offset = 0;
        if (i < stripeSize)
          offset = blockSize * (stripeIdx * stripeSize + i);
        else
          offset = blockSize * (stripeIdx * paritySize + i - stripeSize);
        if (j >= validErasedLocations.length || i != validErasedLocations[j]) {
          if (i < stripeSize) {
            if (offset > srcStat.getLen()) {
              inputs[i] = new FSDataInputStream(
                  new RaidUtils.ZeroInputStream(offset + blockSize));
            } else {
              inputs[i] = fs.open(srcFile,
                  conf.getInt("io.file.buffer.size", 64 * 1024),
                  stripeSize, paritySize,
                  validErasedLocations.length, validErasedLocations);
              inputs[i].seek(offset);
            }
          } else {
            inputs[i] = parityFs.open(parityFile,
                conf.getInt("io.file.buffer.size", 64 * 1024),
                stripeSize, paritySize,
                validErasedLocations.length, validErasedLocations);
            inputs[i].seek(offset);
          }
        } else {
          // erased location: substitute a stream of zeros
          inputs[i] = new FSDataInputStream(
              new RaidUtils.ZeroInputStream(offset + blockSize));
          j++;
        }
      }
    }

    public void closeStreams(FSDataInputStream[] inputs) throws IOException {
      for (FSDataInputStream s : inputs)
        if (s != null)
          s.close();
    }
  }

  /**
   * Fix an erased block (degraded-read path).
   */
  public void fixErasedBlock(FileSystem fs, Path srcFile, FileSystem parityFs,
      Path parityFile, long blockSize, long errorOffset, long limit,
      OutputStream out) throws IOException {
    int stripeIdx = (int) (errorOffset / blockSize / stripeSize);
    PMStreamFactory sf = new PMStreamFactory(fs, srcFile, parityFs, parityFile, stripeIdx);
    int[] erasedLocations = new int[1];
    erasedLocations[0] = ((int) (errorOffset / blockSize)) % stripeSize;
    int[] temp = new int[stripeSize + paritySize + 1];
    pmValidate(stripeSize, paritySize, erasedLocations.length, erasedLocations, temp);
    int[] validErasedLocations = new int[temp[0]];
    System.arraycopy(temp, 1, validErasedLocations, 0, temp[0]);
    FSDataInputStream[] inputs = new FSDataInputStream[stripeSize + paritySize];
    encodedBufSize = bufSize * validErasedLocations.length / paritySize;
    sf.buildStream(inputs, validErasedLocations);
    byte[] buf = new byte[(int) limit];
    writeFixedBlock(inputs, erasedLocations, validErasedLocations, limit, buf, sf);
    out.write(buf, 0, (int) limit);
  }

  /**
   * Builder that carries the information PMDegradedReadDecoder and
   * PMRecoveryDecoder need to recover a stripe.
   */
  class DecodePackage {
    public long[] limits;
    public int[] erasedLocations;
    public byte[] outBuf;
    public long limit;
    public File[] lfs;
    public int[] validErasedLocations;
    ByteBuffer buf;

    DecodePackage(int[] erasedLocations, int[] validErasedLocations, ByteBuffer buf) {
      this.erasedLocations = erasedLocations;
      this.validErasedLocations = validErasedLocations;
      this.buf = buf;
    }

    DecodePackage limit(long limit) {
      this.limit = limit;
      return this;
    }

    DecodePackage limits(long[] limits) {
      this.limits = limits;
      return this;
    }

    DecodePackage localFiles(File[] lfs) {
      this.lfs = lfs;
      return this;
    }

    DecodePackage outputBuffer(byte[] outBuf) {
      this.outBuf = outBuf;
      return this;
    }
  }

  void writeFixedBlock(FSDataInputStream[] inputs, int[] erasedLocations,
      int[] validErasedLocations, long limit, byte[] outBuf, PMStreamFactory sf)
      throws IOException {
    int seq = 0;
    for (long read = 0; read < limit;) {
      int failNum = validErasedLocations.length;
      // packet layout: encoded data from every surviving stream, followed by a
      // small trailer holding the sequence number and an "important" flag
      int bufOffset = encodedBufSize * (stripeSize + paritySize - failNum);
      ByteBuffer buf = ByteBuffer.allocate(bufOffset + 64);
      buf.putInt(bufOffset, seq);
      boolean important = false;
      // mark the last threadNum packets as important
      if ((limit - read + bufSize - 1) / bufSize <= threadNum) {
        important = true;
        buf.put(bufOffset + 4, (byte) 1);
      } else {
        buf.put(bufOffset + 4, (byte) 0);
      }
      LOG.info("anchor Decode_stripe " + seq + " Data_reading " + System.nanoTime());

      // read one packet from every surviving stream
      buf.rewind();
      validErasedLocations = readFromInputs(inputs, validErasedLocations, buf, sf, seq);
      LOG.info("anchor Decode_stripe " + seq + " Data_read " + System.nanoTime());
      buf.rewind();
      int toRead = (int) Math.min((long) bufSize, limit - read);

      // pick the queue with the most remaining capacity
      int remain = -1;
      int chosen = -1;
      for (int i = 0; i < threadNum; i++) {
        int rc = q[i].remainingCapacity();
        if (remain < rc) {
          remain = rc;
          chosen = i;
        }
      }
      if (important) {
        chosen = (int) (((limit - read + bufSize - 1) / bufSize - 1) % threadNum);
      }
      DecodePackage dp = (new DecodePackage(erasedLocations, validErasedLocations, buf))
          .limit(limit).outputBuffer(outBuf);

      // dispatch to the chosen decode thread
      boolean flag = true;
      while (flag) {
        flag = false;
        try {
          q[chosen].put(dp);
        } catch (InterruptedException e) {
          Thread.currentThread().interrupt();
          flag = true;
        }
      }
      LOG.info("anchor Decode_stripe " + seq + " Data_pushed " + System.nanoTime());
      seq++;
      read += toRead;
    }

    // wait for every decode thread to finish
    for (int i = 0; i < threadNum; i++) {
      boolean flag = true;
      while (flag) {
        flag = false;
        try {
          p[i].take();
        } catch (InterruptedException e) {
          Thread.currentThread().interrupt();
          flag = true;
        }
      }
    }
  }

  void writeFixedBlock(FSDataInputStream[] inputs, int[] erasedLocations,
      int[] validErasedLocations, Map<Integer, LocatedBlock> corruptStripe,
      File[] lbfs, long[] limits, PMStreamFactory sf) throws IOException {
    long limit = 0;
    for (int i = 0; i < limits.length; i++)
      if (limit < limits[i])
        limit = limits[i];

    int seq = 0;
    for (long read = 0; read < limit;) {
      int failNum = validErasedLocations.length;
      int bufOffset = encodedBufSize * (stripeSize + paritySize - failNum);
      ByteBuffer buf = ByteBuffer.allocate(bufOffset + 64);
      buf.putInt(bufOffset, seq);
      //LOG.info("bufOffset: " + bufOffset + " encodedBufSize: " + encodedBufSize);
      boolean important = false;
      // mark the last threadNum packets as important
      if ((limit - read + bufSize - 1) / bufSize <= threadNum) {
        important = true;
        buf.put(bufOffset + 4, (byte) 1);
      } else {
        buf.put(bufOffset + 4, (byte) 0);
      }
      LOG.info("anchor Decode_stripe " + seq + " Data_reading " + System.nanoTime());

      // read one packet from every surviving stream
      buf.rewind();
      validErasedLocations = readFromInputs(inputs, validErasedLocations, buf, sf, seq);
      LOG.info("anchor Decode_stripe " + seq + " Data_read " + System.nanoTime());
      int toRead = (int) Math.min((long) bufSize, limit - read);
      buf.rewind();

      // pick the queue with the most remaining capacity
      int remain = -1;
      int chosen = -1;
      for (int i = 0; i < threadNum; i++) {
        int rc = q[i].remainingCapacity();
        if (remain < rc) {
          remain = rc;
          chosen = i;
        }
      }
      if (important) {
        chosen = (int) ((((limit - read) + bufSize - 1) / bufSize - 1) % threadNum);
      }
      DecodePackage dp = (new DecodePackage(erasedLocations, validErasedLocations, buf))
          .limits(limits).localFiles(lbfs);

      // dispatch to the chosen decode thread
      boolean flag = true;
      while (flag) {
        flag = false;
        try {
          q[chosen].put(dp);
        } catch (InterruptedException e) {
          Thread.currentThread().interrupt();
          flag = true;
        }
      }
      LOG.info("anchor Decode_stripe " + seq + " Data_pushed " + System.nanoTime());
      seq++;
      read += toRead;
    }

    // wait for every decode thread to finish
    for (int i = 0; i < threadNum; i++) {
      boolean flag = true;
      while (flag) {
        flag = false;
        try {
          p[i].take();
        } catch (InterruptedException e) {
          Thread.currentThread().interrupt();
          flag = true;
        }
      }
    }
  }

  int[] readFromInputs(FSDataInputStream[] inputs, int[] validErasedLocations,
      ByteBuffer buf, PMStreamFactory sf, int seq) throws IOException {
    boolean flag = true;
    while (flag) {
      flag = false;
      // read encodedBufSize bytes from every surviving input
      for (int i = 0, j = 0; i < inputs.length; i++) {
        if (j >= validErasedLocations.length || i != validErasedLocations[j]) {
          try {
            LOG.info("read input:" + i + " encoded bs:" + encodedBufSize + " " +
                System.nanoTime());
            RaidUtils.readTillEnd(inputs[i], readBufs[i], encodedBufSize, true);
            continue;
          } catch (BlockMissingException e) {
            LOG.error("Encountered BlockMissingException in stream " + i);
          } catch (ChecksumException e) {
            LOG.error("Encountered ChecksumException in stream " + i);
          }
        } else {
          j++;
          continue;
        }

        // too many failures
        if (validErasedLocations.length == paritySize) {
          String msg = "Too many read errors";
          LOG.error(msg);
          throw new IOException(msg);
        }

        // a read failed: add this stream to the erased set and rebuild the streams
        int[] newErasedLocations = new int[validErasedLocations.length + 1];
        for (int k = 0; k < validErasedLocations.length; k++) {
          newErasedLocations[k] = validErasedLocations[k];
        }
        newErasedLocations[newErasedLocations.length - 1] = i;
        int[] temp = new int[stripeSize + paritySize + 1];
        pmValidate(stripeSize, paritySize, newErasedLocations.length,
            newErasedLocations, temp);
        validErasedLocations = new int[temp[0]];
        System.arraycopy(temp, 1, validErasedLocations, 0, temp[0]);
        Arrays.sort(validErasedLocations);
        sf.closeStreams(inputs);
        sf.buildStream(inputs, validErasedLocations, seq * bufSize);
        // restart the whole read
        flag = true;
        break;
      }
    }
    LOG.info("end read encoded bs:" + encodedBufSize + " " + System.nanoTime());
    for (int i = 0, j = 0; i < inputs.length; i++)
      if (j >= validErasedLocations.length || i != validErasedLocations[j])
        buf.put(readBufs[i], 0, encodedBufSize);
      else
        j++;
    LOG.info("end import encoded bs:" + encodedBufSize + " " + System.nanoTime());
    return validErasedLocations;
  }

  class PMDegradedReadDecoder implements Runnable {
    private int[] locations = null;
    private final ByteBuffer inBuf;
    private final ByteBuffer outBuf;
    private final int idx;
    private final ByteBuffer temp;

    native void pmDecode(ByteBuffer in, ByteBuffer out, int k, int m, int n,
        int[] validLocations, int[] locations, int bufSize, ByteBuffer temp,
        boolean doReconstruct);

    PMDegradedReadDecoder(int idx) {
      inBuf = ByteBuffer.allocateDirect(bufSize * stripeSize + 64);
      outBuf = ByteBuffer.allocateDirect(bufSize + 64);
      temp = ByteBuffer.allocateDirect(1024);
      this.idx = idx;
    }

    public void run() {
      while (true) {
        DecodePackage pkg = null;
        try {
          pkg = (DecodePackage) (q[idx].take());
        } catch (InterruptedException e) {
          Thread.currentThread().interrupt();
          continue;
        }
        inBuf.put(pkg.buf);
        inBuf.rewind();
        int length = pkg.validErasedLocations.length;
        int inBufOffset = bufSize * (paritySize + stripeSize - length) * length / paritySize;
        byte flag = inBuf.get(4 + inBufOffset);
        boolean doReconstruct = false;
        // rebuild the decoding matrix only when the failure pattern changes
        if (!Arrays.equals(this.locations, pkg.validErasedLocations)) {
          LOG.info("Do Reconstruct");
          doReconstruct = true;
          this.locations = pkg.validErasedLocations;
        }
        pmDecode(inBuf, outBuf, stripeSize, paritySize, locations.length, locations,
            pkg.erasedLocations, bufSize, temp, doReconstruct);
        int outBufOffset = bufSize;
        int seq = outBuf.getInt(outBufOffset);
        outBuf.rewind();
        int len = (int) Math.min((pkg.limit - seq * bufSize), (long) bufSize);
        outBuf.get(pkg.outBuf, seq * bufSize, len);
        // if this was the last packet, signal completion
        while (flag == (byte) 1) {
          try {
            p[idx].put(1);
            flag = (byte) 0;
          } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            LOG.warn(e);
          }
        }
      }
    }
  }

  class PMRecoveryDecoder implements Runnable {
    private RandomAccessFile[] outs = null;
    private int[] locations = null;
    private final ByteBuffer inBuf;
    private final ByteBuffer outBuf;
    private final int idx;
    private final ByteBuffer temp;
    private int[] initLocations = null;

    native void pmDecode(ByteBuffer in, ByteBuffer out, int k, int m, int n,
        int[] validLocations, int[] locations, int bufSize, ByteBuffer temp,
        boolean doReconstruct);

    PMRecoveryDecoder(int idx) {
      inBuf = ByteBuffer.allocateDirect(bufSize * stripeSize + 64);
      outBuf = ByteBuffer.allocateDirect(bufSize * paritySize + 64);
      temp = ByteBuffer.allocateDirect(1024);
      this.idx = idx;
    }

    public void run() {
      while (true) {
        DecodePackage pkg = null;
        try {
          pkg = (DecodePackage) (q[idx].take());
        } catch (InterruptedException e) {
          Thread.currentThread().interrupt();
          continue;
        }
        inBuf.put(pkg.buf);
        inBuf.rewind();
        int length = pkg.validErasedLocations.length;
        int inBufOffset = bufSize * (paritySize + stripeSize - length) * length / paritySize;
        byte flag = inBuf.get(4 + inBufOffset);
        int s = inBuf.getInt(inBufOffset);
        LOG.info("in seq: " + inBuf.getInt(inBufOffset) + " with offset: " + inBufOffset +
            " buf Size: " + bufSize);
        boolean doReconstruct = false;
        // rebuild the decoding matrix only when the failure pattern changes
        if (!Arrays.equals(this.locations, pkg.validErasedLocations)) {
          doReconstruct = true;
          this.locations = pkg.validErasedLocations;
        }
        LOG.info("anchor Decode_stripe " + s + " Data_decoding " + System.nanoTime());
        pmDecode(inBuf, outBuf, stripeSize, paritySize, locations.length, locations,
            pkg.erasedLocations, bufSize, temp, doReconstruct);
        LOG.info("anchor Decode_stripe " + s + " Data_decoded " + System.nanoTime());

        File[] fs = pkg.lfs;
        if (outs == null) {
          try {
            outs = new RandomAccessFile[fs.length];
            for (int i = 0; i < fs.length; i++)
              outs[i] = new RandomAccessFile(fs[i], "rw");
          } catch (IOException e) {
            // TODO: this error should be propagated to the caller
            LOG.error("IOException in PMRecoveryDecoder");
            return;
          }
        }
        int outBufOffset = bufSize * pkg.erasedLocations.length;
        //int iseq = inBuf.getInt(inBufOffset);
        //outBuf.putInt(outBufOffset, iseq);
        int seq = outBuf.getInt(outBufOffset);
        LOG.info("out seq: " + seq);
        outBuf.rewind();
        byte[] bufarr = new byte[outBuf.remaining()];
        outBuf.get(bufarr, 0, bufarr.length);
        try {
          // write each recovered block to its local file
          for (int i = 0; i < outs.length; i++) {
            if (seq * bufSize < pkg.limits[i]) {
              outs[i].seek(seq * bufSize);
              int len = (int) Math.min(pkg.limits[i] - seq * bufSize, (long) bufSize);
              outs[i].write(bufarr, i * bufSize, len);
            }
          }
        } catch (IOException e) {
          LOG.error("Unexpected IOException " + e);
        }
        // if this was the last packet, close the outputs and signal completion
        while (flag == (byte) 1) {
          try {
            for (int i = 0; i < outs.length; i++) {
              outs[i].close();
              outs[i] = null;
            }
            outs = null;
          } catch (IOException e) {
            LOG.error("Unexpected IOException " + e);
            return;
          }
          try {
            p[idx].put(1);
            flag = (byte) 0;
          } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            LOG.warn(e);
          }
        }
      }
    }
  }
}
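The listing above is the decoder itself; the snippet below is a minimal usage sketch, not part of the original file. It assumes the native libraries pmdecoder and pmvalidate are on java.library.path, that bufSize and the open/readTillEnd helpers are supplied by the Decoder superclass and RaidUtils (not shown here), and that the paths, the 10+4 stripe geometry, and the block size are hypothetical values chosen for illustration. Passing forRecovery = false selects the degraded-read path, whose public fixErasedBlock overload writes the reconstructed bytes to the supplied OutputStream.

import java.io.ByteArrayOutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.raid.PMDecoder;

public class PMDecoderExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.setInt("hdfs.raid.decoder.threadnum", 2); // number of decode threads

    FileSystem fs = FileSystem.get(conf);
    Path srcFile = new Path("/user/foo/data");         // hypothetical source file
    Path parityFile = new Path("/raid/user/foo/data"); // hypothetical parity file

    int stripeSize = 10;                 // k: data blocks per stripe (assumed)
    int paritySize = 4;                  // m: parity blocks per stripe (assumed)
    long blockSize = 64L * 1024 * 1024;  // assumed HDFS block size

    // forRecovery = false starts the degraded-read decode threads
    PMDecoder decoder = new PMDecoder(conf, stripeSize, paritySize, false);

    // Reconstruct one block's worth of data starting at the block that
    // contains errorOffset, and collect it in memory.
    long errorOffset = 3L * blockSize;
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    decoder.fixErasedBlock(fs, srcFile, fs, parityFile, blockSize,
        errorOffset, blockSize, out);
    System.out.println("recovered " + out.size() + " bytes");
  }
}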