Java tutorial
/*
 * Copyright 2016 iychoi.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package libra.core.kmersimilarity_m;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.Map;
import libra.common.hadoop.io.datatypes.CompressedSequenceWritable;
import libra.common.json.JsonSerializer;
import libra.common.kmermatch.KmerMatchFileMapping;
import libra.common.kmermatch.KmerMatchResult;
import libra.core.commom.CoreConfig;
import libra.core.common.kmersimilarity.KmerSimilarityOutputRecord;
import libra.preprocess.common.helpers.KmerIndexHelper;
import libra.preprocess.common.helpers.KmerStatisticsHelper;
import libra.preprocess.common.kmerstatistics.KmerStatistics;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Mapper that accumulates a pairwise score matrix between input files from
 * k-mer match results.
 *
 * <p>For every k-mer that is present in more than one file, a weight
 * {@code (1 + log10(frequency)) / tfCosineNormBase[file]} is computed per file
 * and the outer product of those weights is added to a flattened
 * {@code valuesLen x valuesLen} matrix. Nothing is emitted from
 * {@link #map}; the whole matrix is serialized to JSON and written once, in
 * {@link #cleanup}.
 *
 * @author iychoi
 */
@SuppressWarnings("deprecation")
public class KmerSimilarityMapper extends Mapper<CompressedSequenceWritable, KmerMatchResult, Text, Text> {

    private static final Log LOG = LogFactory.getLog(KmerSimilarityMapper.class);

    private CoreConfig libraConfig;
    private KmerMatchFileMapping fileMapping;
    /**
     * Cache from k-mer index file name to file id. It is only accessed from
     * {@link #map} on this instance, so an unsynchronized map is sufficient.
     */
    private Map<String, Integer> idCacheTable;
    private Counter reportCounter;
    private JsonSerializer serializer;

    /** Number of files reported by the file mapping; the matrix is valuesLen x valuesLen. */
    private int valuesLen;
    /** Row-major score matrix: cell (i, j) lives at index {@code i * valuesLen + j}. */
    private double[] scoreAccumulated;
    /** Per-file divisor, indexed by file id, read from each file's k-mer statistics. */
    private double[] tfCosineNormBase;

    /**
     * Builds the per-task state: configuration, file mapping, id cache, the
     * zeroed score matrix and the per-file normalization bases.
     *
     * @param context the task context supplying configuration and counters
     * @throws IOException if a statistics file cannot be located or read
     * @throws InterruptedException declared by the overridden method
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        this.libraConfig = CoreConfig.createInstance(context.getConfiguration());
        this.fileMapping = KmerMatchFileMapping.createInstance(context.getConfiguration());

        this.idCacheTable = new HashMap<String, Integer>();

        this.reportCounter = context.getCounter("KmerSimilarity", "report");
        this.serializer = new JsonSerializer();

        this.valuesLen = this.fileMapping.getSize();
        // Java zero-initializes new arrays, so no explicit fill is required.
        this.scoreAccumulated = new double[this.valuesLen * this.valuesLen];
        this.tfCosineNormBase = new double[this.valuesLen];

        // Load the normalization base of every file from its statistics file.
        for (int i = 0; i < this.tfCosineNormBase.length; i++) {
            String fastaFilename = this.fileMapping.getFastaFileFromID(i);
            String statisticsFilename = KmerStatisticsHelper.makeKmerStatisticsFileName(fastaFilename);
            Path statisticsPath = new Path(this.libraConfig.getKmerStatisticsPath(), statisticsFilename);
            FileSystem fs = statisticsPath.getFileSystem(context.getConfiguration());

            KmerStatistics statistics = KmerStatistics.createInstance(fs, statisticsPath);
            this.tfCosineNormBase[i] = statistics.getTFCosineNormBase();
        }
    }

    /**
     * Folds one k-mer match result into the score matrix.
     *
     * <p>Entries whose value is {@code null} are ignored. If at most one
     * entry remains, the record is skipped entirely: nothing is accumulated,
     * the id cache is not touched and the report counter is not incremented.
     *
     * @param key the k-mer (not inspected here)
     * @param value per-index-file frequencies of the k-mer and the matching index paths
     * @param context the task context (not written to here)
     * @throws IOException if a file id cannot be resolved
     * @throws InterruptedException declared by the overridden method
     */
    @Override
    protected void map(CompressedSequenceWritable key, KmerMatchResult value, Context context)
            throws IOException, InterruptedException {
        IntWritable[] frequencies = value.getVals();
        Path[] indexPaths = value.getKmerIndexPath();

        // Count first so that skipped records leave the id cache untouched.
        int present = 0;
        for (int i = 0; i < frequencies.length; i++) {
            if (frequencies[i] != null) {
                present++;
            }
        }

        if (present <= 1) {
            // a k-mer seen in fewer than two entries contributes nothing
            return;
        }

        // Dense weight vector indexed by file id; absent files stay at 0.
        double[] normal = new double[this.valuesLen];
        for (int i = 0; i < frequencies.length; i++) {
            IntWritable frequency = frequencies[i];
            if (frequency == null) {
                continue;
            }

            int fileId = resolveFileId(indexPaths[i].getName());
            double tf = 1 + Math.log10(frequency.get());
            normal[fileId] = tf / this.tfCosineNormBase[fileId];
        }

        accumulateScore(normal);

        this.reportCounter.increment(1);
    }

    /**
     * Returns the file id for a k-mer index file name, consulting the cache
     * first and filling it on a miss.
     *
     * @param indexFilename the name (last path component) of a k-mer index file
     * @return the id the file mapping assigns to the corresponding FASTA file
     * @throws IOException if the file mapping lookup fails
     */
    private int resolveFileId(String indexFilename) throws IOException {
        Integer cached = this.idCacheTable.get(indexFilename);
        if (cached != null) {
            return cached.intValue();
        }

        String fastaFilename = KmerIndexHelper.getFastaFileName(indexFilename);
        int id = this.fileMapping.getIDFromFastaFile(fastaFilename);
        this.idCacheTable.put(indexFilename, id);
        return id;
    }

    /**
     * Adds the outer product {@code normal[i] * normal[j]} to the score matrix.
     *
     * <p>Zero entries are skipped. For finite weights this yields exactly the
     * same sums as a dense update (adding a zero product changes nothing)
     * while avoiding {@code valuesLen * valuesLen} work per k-mer. It also
     * keeps a non-finite weight from turning cells of absent files into NaN
     * through {@code 0 * Infinity}.
     *
     * @param normal weight per file id, length {@code valuesLen}
     */
    private void accumulateScore(double[] normal) {
        final int n = this.valuesLen;
        for (int i = 0; i < n; i++) {
            double rowWeight = normal[i];
            if (rowWeight == 0) {
                continue;
            }

            int rowBase = i * n;
            for (int j = 0; j < n; j++) {
                double colWeight = normal[j];
                if (colWeight != 0) {
                    this.scoreAccumulated[rowBase + j] += rowWeight * colWeight;
                }
            }
        }
    }

    /**
     * Emits the accumulated score matrix as a single record, then releases
     * the per-task state.
     *
     * <p>The record key is a single space and the value is the JSON form of a
     * {@link KmerSimilarityOutputRecord} holding the flattened matrix.
     *
     * @param context the task context the record is written to
     * @throws IOException if serialization or the write fails
     * @throws InterruptedException if the write is interrupted
     */
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        KmerSimilarityOutputRecord rec = new KmerSimilarityOutputRecord();
        rec.setScore(this.scoreAccumulated);

        String json = this.serializer.toJson(rec);
        context.write(new Text(" "), new Text(json));

        this.fileMapping = null;

        this.idCacheTable.clear();
        this.idCacheTable = null;

        this.libraConfig = null;

        this.scoreAccumulated = null;
        this.tfCosineNormBase = null;

        this.serializer = null;
    }
}