List of usage examples for org.apache.hadoop.io.Text#toString()
@Override
public String toString()
From source file:com.ibm.bi.dml.runtime.io.ReaderTextCSVParallel.java
License:Open Source License
/** * //from w w w . ja va2 s . c om * @param path * @param job * @param hasHeader * @param delim * @return * @throws IOException * @throws DMLRuntimeException */ private MatrixBlock computeCSVSizeAndCreateOutputMatrixBlock(InputSplit[] splits, Path path, JobConf job, boolean hasHeader, String delim, long estnnz) throws IOException, DMLRuntimeException { int nrow = 0; int ncol = 0; FileInputFormat.addInputPath(job, path); TextInputFormat informat = new TextInputFormat(); informat.configure(job); // count no of entities in the first non-header row LongWritable key = new LongWritable(); Text oneLine = new Text(); RecordReader<LongWritable, Text> reader = informat.getRecordReader(splits[0], job, Reporter.NULL); try { if (reader.next(key, oneLine)) { String cellStr = oneLine.toString().trim(); ncol = StringUtils.countMatches(cellStr, delim) + 1; } } finally { IOUtilFunctions.closeSilently(reader); } // count rows in parallel per split try { ExecutorService pool = Executors.newFixedThreadPool(_numThreads); ArrayList<CountRowsTask> tasks = new ArrayList<CountRowsTask>(); for (InputSplit split : splits) { tasks.add(new CountRowsTask(split, informat, job, hasHeader)); hasHeader = false; } pool.invokeAll(tasks); pool.shutdown(); // collect row counts for offset computation // early error notify in case not all tasks successful _offsets = new SplitOffsetInfos(tasks.size()); for (CountRowsTask rt : tasks) { if (!rt.getReturnCode()) throw new IOException("Count task for csv input failed: " + rt.getErrMsg()); _offsets.setOffsetPerSplit(tasks.indexOf(rt), nrow); _offsets.setLenghtPerSplit(tasks.indexOf(rt), rt.getRowCount()); nrow = nrow + rt.getRowCount(); } } catch (Exception e) { throw new IOException("Threadpool Error " + e.getMessage(), e); } // allocate target matrix block based on given size; // need to allocate sparse as well since lock-free insert into target return createOutputMatrixBlock(nrow, ncol, estnnz, true, true); }
From source file:com.ibm.bi.dml.runtime.matrix.data.TextCellToRowBlockConverter.java
License:Open Source License
@Override public void convert(LongWritable k1, Text v1) { String str = v1.toString(); //handle support for matrix market format if (str.startsWith("%")) { if (str.startsWith("%%")) toIgnore = true;//from www.ja v a2 s . com hasValue = false; return; } else if (toIgnore) { toIgnore = false; hasValue = false; return; } //reset the tokenizer st.reset(str); //convert text to row block indexes.setIndexes(st.nextLong(), st.nextLong()); rowBlock.reset(1, 1); rowBlock.quickSetValue(0, 0, st.nextDouble()); hasValue = true; }
From source file:com.ibm.bi.dml.runtime.matrix.data.TextToBinaryCellConverter.java
License:Open Source License
@Override public void convert(LongWritable k1, Text v1) { String str = v1.toString(); //handle support for matrix market format if (str.startsWith("%")) { if (str.startsWith("%%")) toIgnore = true;/* w w w .ja v a 2 s . com*/ hasValue = false; return; } else if (toIgnore) { toIgnore = false; hasValue = false; return; } //reset the tokenizer st.reset(str); //convert text to matrix cell indexes.setIndexes(st.nextLong(), st.nextLong()); value.setValue(st.nextDouble()); hasValue = true; }
From source file:com.ibm.bi.dml.runtime.matrix.mapred.CSVAssignRowIDMapper.java
License:Open Source License
@Override public void map(LongWritable key, Text value, OutputCollector<ByteWritable, OffsetCount> out, Reporter report) throws IOException { if (first) {// w w w. j a va2s . c o m first = false; fileOffset = key.get(); outCache = out; } if (key.get() == 0 && headerFile)//getting the number of colums { if (!ignoreFirstLine) { report.incrCounter(CSVReblockMR.NUM_COLS_IN_MATRIX, outKey.toString(), value.toString().split(delim, -1).length); if (!omit(value.toString())) num++; } else realFirstLine = true; } else { if (realFirstLine) { report.incrCounter(CSVReblockMR.NUM_COLS_IN_MATRIX, outKey.toString(), value.toString().split(delim, -1).length); realFirstLine = false; } if (!omit(value.toString())) num++; } }
From source file:com.ibm.bi.dml.runtime.matrix.mapred.CSVReblockMapper.java
License:Open Source License
/**
 * Splits one CSV line into cells and feeds it to {@code processRow} for every reblock
 * instruction.
 *
 * <p>On the first call the row offset for this mapper is looked up in {@code offsetMap}
 * using the key. The line at key 0 of a header file is dropped when
 * {@code ignoreFirstLine} is set. {@code num} is incremented once per processed line.
 *
 * @param key      input key; used for the offset lookup and header detection
 * @param value    CSV text line
 * @param out      collector handed to {@code processRow}
 * @param reporter not used by this method
 * @throws IOException declared by the mapper interface
 */
@Override
public void map(LongWritable key, Text value, OutputCollector<TaggedFirstSecondIndexes, BlockRow> out,
        Reporter reporter) throws IOException {
    if (first) {
        rowOffset = offsetMap.get(key.get());
        first = false;
    }

    final boolean skipHeaderLine = key.get() == 0 && headerFile && ignoreFirstLine;
    if (skipHeaderLine) {
        return;
    }

    final String line = value.toString();
    final String[] cells = IOUtilFunctions.split(line, _delim);

    for (int i = 0; i < representativeMatrixes.size(); i++) {
        for (CSVReblockInstruction ins : csv_reblock_instructions.get(i)) {
            idxRow = processRow(idxRow, cells, rowOffset, num, ins.output, ins.brlen, ins.bclen, ins.fill,
                    ins.fillValue, out);
        }
    }

    num++;
}
From source file:com.ibm.bi.dml.runtime.transform.ApplyTfBBMapper.java
License:Open Source License
@Override public void map(LongWritable rawKey, Text rawValue, OutputCollector<TaggedFirstSecondIndexes, CSVReblockMR.BlockRow> out, Reporter reporter) throws IOException { if (_first) { rowOffset = offsetMap.get(rawKey.get()); _reporter = reporter;/*w w w . j ava2s .c o m*/ _first = false; } // output the header line if (rawKey.get() == 0 && _partFileWithHeader) { tfmapper.processHeaderLine(); if (tfmapper.hasHeader()) return; } // parse the input line and apply transformation String[] words = tfmapper.getWords(rawValue); if (!tfmapper.omit(words)) { words = tfmapper.apply(words); try { tfmapper.check(words); // Perform CSV Reblock CSVReblockInstruction ins = csv_reblock_instructions.get(0).get(0); idxRow = CSVReblockMapper.processRow(idxRow, words, rowOffset, num, ins.output, ins.brlen, ins.bclen, ins.fill, ins.fillValue, out); } catch (DMLRuntimeException e) { throw new RuntimeException(e.getMessage() + ":" + rawValue.toString()); } num++; } }
From source file:com.ibm.bi.dml.runtime.transform.ApplyTfCSVMapper.java
License:Open Source License
@Override public void map(LongWritable rawKey, Text rawValue, OutputCollector<NullWritable, Text> out, Reporter reporter) throws IOException { if (_firstRecordInSplit) { _firstRecordInSplit = false;//w w w . jav a 2s .co m _reporter = reporter; // generate custom output paths so that order of rows in the // output (across part files) matches w/ that from input data set String partFileSuffix = tfmapper.getPartFileID(_rJob, rawKey.get()); Path mapOutputPath = new Path(tfmapper.getOutputPath() + "/transform-part-" + partFileSuffix); // setup the writer for mapper's output // the default part-..... files will be deleted later once the job finishes br = new BufferedWriter(new OutputStreamWriter(FileSystem.get(_rJob).create(mapOutputPath, true))); } // output the header line if (rawKey.get() == 0 && _partFileWithHeader) { _reporter = reporter; tfmapper.processHeaderLine(); if (tfmapper.hasHeader()) return; } // parse the input line and apply transformation String[] words = tfmapper.getWords(rawValue); if (!tfmapper.omit(words)) { try { words = tfmapper.apply(words); String outStr = tfmapper.checkAndPrepOutputString(words); //out.collect(NullWritable.get(), new Text(outStr)); br.write(outStr + "\n"); } catch (DMLRuntimeException e) { throw new RuntimeException(e.getMessage() + ": " + rawValue.toString()); } } }
From source file:com.ibm.bi.dml.runtime.transform.ApplyTfHelper.java
License:Open Source License
/**
 * Splits a text line into its fields using {@code _delim}.
 *
 * @param line text line to split
 * @return the result of {@code _delim.split(..., -1)} on the line's string form
 *         (presumably {@code _delim} is a compiled pattern, in which case the limit of -1
 *         keeps trailing empty fields - confirm against its declaration)
 */
public String[] getWords(Text line) {
    final String raw = line.toString();
    return _delim.split(raw, -1);
}
From source file:com.ibm.bi.dml.runtime.transform.GTFMTDMapper.java
License:Open Source License
public void map(LongWritable rawKey, Text rawValue, OutputCollector<IntWritable, DistinctValue> out, Reporter reporter) throws IOException { if (_firstRecordInSplit) { _firstRecordInSplit = false;// www .j a va 2 s . c o m _collector = out; _offsetInPartFile = rawKey.get(); } // ignore header if (_agents.hasHeader() && rawKey.get() == 0 && _partFileWithHeader) return; _agents.prepareTfMtd(rawValue.toString()); }
From source file:com.ibm.bi.dml.runtime.transform.TfUtils.java
License:Open Source License
/**
 * Convenience overload that converts the given text to a string and delegates to the
 * {@code getWords} variant taking a string.
 *
 * @param line text line to split
 * @return whatever the string-based {@code getWords} returns for the line
 */
public String[] getWords(Text line) {
    final String raw = line.toString();
    return getWords(raw);
}