List of usage examples for org.apache.hadoop.fs.FileSystem.create
public FSDataOutputStream create(Path f) throws IOException
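Before the collected examples, here is a minimal self-contained sketch of the pattern they all share: obtain a FileSystem from a Configuration, call create(Path) to get an FSDataOutputStream, write to it, and close it. The class name, destination path, and payload are hypothetical and not taken from any example below.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Minimal usage sketch (hypothetical class name, path, and payload).
public class FsCreateExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);        // default filesystem for this configuration
        Path outPath = new Path("/tmp/example.txt"); // hypothetical destination path
        FSDataOutputStream out = fs.create(outPath); // creates (or overwrites) the file
        try {
            out.writeBytes("hello\n");
        } finally {
            out.close();
        }
    }
}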
From source file:com.yahoo.spaclu.data.index.IndexFeatureValueSpark.java
License:Apache License
public static void main(String[] args) throws IOException {
    IndexFeatureValueOptions optionsFormatRawToDatabase = new IndexFeatureValueOptions(args);
    String inputPathString = optionsFormatRawToDatabase.getInputPath();
    String outputPathString = optionsFormatRawToDatabase.getOutputPath();
    String indexPathString = optionsFormatRawToDatabase.getIndexPath();
    int numberOfPartitions = optionsFormatRawToDatabase.getNumberOfPartitions();
    int maxCutoffThreshold = optionsFormatRawToDatabase.getMaximumCutoffThreshold();
    int minCutoffThreshold = optionsFormatRawToDatabase.getMinimumCutoffThreshold();

    /*
     * Set<String> excludingFeatureNames = new HashSet<String>();
     * excludingFeatureNames.add("login");
     * excludingFeatureNames.add("time"); excludingFeatureNames.add("day");
     * excludingFeatureNames.add("hms"); excludingFeatureNames.add("fail");
     */

    sLogger.info("Tool: " + IndexFeatureValueSpark.class.getSimpleName());
    sLogger.info(" - input path: " + inputPathString);
    sLogger.info(" - output path: " + outputPathString);
    sLogger.info(" - index path: " + indexPathString);
    sLogger.info(" - number of partitions: " + numberOfPartitions);
    sLogger.info(" - maximum cutoff: " + maxCutoffThreshold);
    sLogger.info(" - minimum cutoff: " + minCutoffThreshold);

    // Create a default Hadoop configuration
    Configuration conf = new Configuration();
    // Pass the created configuration to get the HDFS filesystem
    FileSystem fs = FileSystem.get(conf);

    Path outputPath = new Path(outputPathString);
    if (fs.exists(outputPath)) {
        fs.delete(outputPath, true);
    }

    SparkConf sparkConf = new SparkConf().setAppName(optionsFormatRawToDatabase.toString());
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    Map<Integer, String> featureIndices = getFeatureIndices(sc.textFile(indexPathString));
    List<Integer> listOfAllFeatureIndices = new LinkedList<Integer>();
    List<String> listOfAllFeatureInfo = new LinkedList<String>();
    Iterator<Integer> indexIter = featureIndices.keySet().iterator();
    while (indexIter.hasNext()) {
        Integer tempKey = indexIter.next();
        listOfAllFeatureIndices.add(tempKey);
        listOfAllFeatureInfo.add(featureIndices.get(tempKey));
    }

    JavaRDD<String> rawLines = sc.textFile(inputPathString).repartition(numberOfPartitions);

    JavaRDD<String[]> tokenizedLines = rawLines.map(new LineFilter(listOfAllFeatureIndices));
    JavaPairRDD<Entry<Integer, String>, Long> featureValuesCounts = tokenizedLines
            .flatMapToPair(new FeatureValueMapper()).reduceByKey(new FeatureValueReducer());

    Map<Integer, Builder<String, Long>> featureValueMapping = new Hashtable<Integer, Builder<String, Long>>();
    Iterator<Tuple2<Entry<Integer, String>, Long>> iter = featureValuesCounts.collect().iterator();
    while (iter.hasNext()) {
        Tuple2<Entry<Integer, String>, Long> temp = iter.next();
        Entry<Integer, String> featureValueEntry = temp._1;
        int featureIndex = featureValueEntry.getKey();
        String featureValue = featureValueEntry.getValue();
        long featureValueCount = temp._2;

        if (!featureValueMapping.containsKey(featureIndex)) {
            Builder<String, Long> mapBuilder = new Builder<String, Long>(Ordering.natural());
            featureValueMapping.put(featureIndex, mapBuilder);
        }

        featureValueMapping.get(featureIndex).put(featureValue, featureValueCount);
    }

    Preconditions.checkArgument(featureValueMapping.size() == listOfAllFeatureIndices.size());

    String outputFeaturePathString = outputPathString + "feature" + Settings.SEPERATOR;
    fs.mkdirs(new Path(outputFeaturePathString));

    String outputFeatureNamePathString = outputPathString + "feature.dat";
    Path outputFeatureNamePath = new Path(outputFeatureNamePathString);
    PrintWriter featureNamePrinterWriter = new PrintWriter(fs.create(outputFeatureNamePath), true);

    List<Integer> listOfFeatureIndicesToKeep = new LinkedList<Integer>();

    Map<Integer, Map<String, Integer>> featureValueIndex = new Hashtable<Integer, Map<String, Integer>>();
    for (int d = 0; d < featureValueMapping.size(); d++) {
        Map<String, Integer> valueToIndex = new Hashtable<String, Integer>();
        Map<Integer, String> indexToValue = new Hashtable<Integer, String>();

        ImmutableSortedMap<String, Long> immutableSortedMap = featureValueMapping.get(d).build();
        for (String keyString : immutableSortedMap.keySet()) {
            valueToIndex.put(keyString, valueToIndex.size());
            indexToValue.put(indexToValue.size(), keyString);
        }

        if (valueToIndex.size() <= minCutoffThreshold || valueToIndex.size() > maxCutoffThreshold) {
            sLogger.info("Feature (" + listOfAllFeatureInfo.get(d) + ") contains " + valueToIndex.size()
                    + " values, skip...");
            continue;
        } else {
            sLogger.info("Feature (" + listOfAllFeatureInfo.get(d) + ") contains " + valueToIndex.size()
                    + " values.");
            listOfFeatureIndicesToKeep.add(listOfAllFeatureIndices.get(d));
            featureNamePrinterWriter.println(listOfAllFeatureInfo.get(d));
        }

        String outputFeatureIndexPathString = outputFeaturePathString + "index" + Settings.UNDER_SCORE
                + featureValueIndex.size() + ".dat";
        Path outputIndexPath = new Path(outputFeatureIndexPathString);

        featureValueIndex.put(featureValueIndex.size(), valueToIndex);
        PrintWriter featureValueIndexPrinterWriter = new PrintWriter(fs.create(outputIndexPath), true);
        for (int i = 0; i < indexToValue.size(); i++) {
            featureValueIndexPrinterWriter.println("" + i + Settings.TAB + indexToValue.get(i) + Settings.TAB
                    + immutableSortedMap.get(indexToValue.get(i)));
        }
        featureValueIndexPrinterWriter.close();
    }

    featureNamePrinterWriter.close();

    JavaRDD<String[]> filteredLines = rawLines.map(new LineFilter(listOfFeatureIndicesToKeep));
    JavaRDD<FeatureIntegerVector> indexedData = filteredLines.map(new FeatureValueIndexer(featureValueIndex));

    String outputDataPathString = outputPathString + "data";
    Path outputDataPath = new Path(outputDataPathString);
    if (fs.exists(outputDataPath)) {
        fs.delete(outputDataPath, true);
    }

    indexedData.saveAsTextFile(outputDataPathString);

    sc.stop();
}
From source file:com.yahoo.storm.yarn.Util.java
License:Open Source License
@SuppressWarnings("rawtypes") static Path createConfigurationFileInFs(FileSystem fs, String appHome, Map stormConf, YarnConfiguration yarnConf) throws IOException { // dump stringwriter's content into FS conf/storm.yaml Path confDst = new Path(fs.getHomeDirectory(), appHome + Path.SEPARATOR + STORM_CONF_PATH_STRING); Path dirDst = confDst.getParent(); fs.mkdirs(dirDst);//from w w w . j av a 2s . com //storm.yaml FSDataOutputStream out = fs.create(confDst); Yaml yaml = new Yaml(); OutputStreamWriter writer = new OutputStreamWriter(out); rmNulls(stormConf); yaml.dump(stormConf, writer); writer.close(); out.close(); //yarn-site.xml Path yarn_site_xml = new Path(dirDst, "yarn-site.xml"); out = fs.create(yarn_site_xml); writer = new OutputStreamWriter(out); yarnConf.writeXml(writer); writer.close(); out.close(); //logback.xml Path logback_xml = new Path(dirDst, "logback.xml"); out = fs.create(logback_xml); CreateLogbackXML(out); out.close(); return dirDst; }
From source file:com.yahoo.ycsb.bulk.hbase.BulkDataGeneratorJob.java
License:Apache License
/** Create the input file used for launching the maps */
void createInputFile(Job job, String workdir) throws IOException {
    Configuration conf = job.getConfiguration();
    FileSystem fs = FileSystem.get(conf);
    Path inpath = new Path(workdir + "/inputkeyranges.txt");
    PrintStream out = new PrintStream(new BufferedOutputStream(fs.create(inpath)));
    long start = conf.getLong(ARG_KEY_RANGE_START, 0);
    long end = conf.getLong(ARG_KEY_RANGE_END, 0);
    int parts = conf.getInt(ARG_KEY_RANGE_PARTITIONS, 1);
    writeRanges(start, end, parts, out);
    out.close();

    TextInputFormat.setInputPaths(job, inpath);
    // NLineInputFormat.setInputPaths(job, inpath);

    /* compute the max input split size */
    // long max_split = fs.getFileStatus(inpath).getLen() / parts;
    // TextInputFormat.setMaxInputSplitSize(job, max_split);

    // JobConf jc = new JobConf(conf);
    // jc.setNumMapTasks(parts);
}
From source file:com.yahoo.ycsb.bulk.hbase.BulkDataGeneratorJob.java
License:Apache License
int createSplitsFile(Configuration conf, String splitsFile) throws IOException, InvalidInputException {
    int splitCount = conf.getInt(ARG_KEY_SPLIT_COUNT, 0);

    if (splitCount <= 0) {
        throw new InvalidInputException("Invalid or unspecified split count:" + splitCount
                + "\nSpecify it in: " + ARG_KEY_SPLIT_COUNT);
    }

    String rowPrefix = conf.get(ARG_KEY_ROW_PREFIX, "row");
    String rowFormat = DataGenerator.getKeyFormat(rowPrefix);
    boolean hashKeys = conf.getBoolean(ARG_KEY_HASH_KEYS, false);
    long start = conf.getInt(ARG_KEY_RANGE_START, 0);
    long end = conf.getInt(ARG_KEY_RANGE_END, 0);

    FileSystem fs = FileSystem.get(conf);
    Path splitsPath = new Path(splitsFile);
    Path plainPath = new Path(splitsFile + "-debug");
    PrintStream out = new PrintStream(new BufferedOutputStream(fs.create(splitsPath)));
    PrintStream plain = new PrintStream(new BufferedOutputStream(fs.create(plainPath)));

    if (hashKeys) {
        start = conf.getInt(ARG_KEY_HASHED_RANGE_START, 0);
        end = conf.getInt(ARG_KEY_HASHED_RANGE_END, Integer.MAX_VALUE);
    }

    long rangeSize = Math.max(1, (end - start + 1) / (splitCount + 1));
    long rangeStart = start + rangeSize;

    System.err.println("Generating splits file: " + splitsFile + "\nrangeStart:" + rangeStart
            + "\nrangeSize: " + rangeSize + "\nsplitCount: " + splitCount + "\nrangeEnd: " + end);

    int i = 0;
    try {
        while (rangeStart < end && splitCount > 0) {
            out.println(new String(Base64.encodeBase64(String.format(rowFormat, rangeStart).getBytes())));
            plain.println(String.format(rowFormat, rangeStart));
            rangeStart += rangeSize;
            splitCount--;
            i++;
        }
    } finally {
        out.close();
        plain.close();
    }

    System.err.println("Splits created: " + i);

    return i;
}
From source file:com.yss.util.YarnUtil.java
License:Open Source License
@SuppressWarnings("rawtypes") public static Path createConfigurationFileInFs(FileSystem fs, String appHome, Map stormConf, YarnConfiguration yarnConf) throws IOException { // dump stringwriter's content into FS conf/storm.yaml Path confDst = new Path(fs.getHomeDirectory(), appHome + Path.SEPARATOR + STORM_CONF_PATH_STRING); Path dirDst = confDst.getParent(); fs.mkdirs(dirDst);//from w w w. j a v a 2 s . c o m //storm.yaml FSDataOutputStream out = fs.create(confDst); Yaml yaml = new Yaml(); OutputStreamWriter writer = new OutputStreamWriter(out); rmNulls(stormConf); yaml.dump(stormConf, writer); writer.close(); out.close(); //yarn-site.xml Path yarn_site_xml = new Path(dirDst, "yarn-site.xml"); out = fs.create(yarn_site_xml); writer = new OutputStreamWriter(out); yarnConf.writeXml(writer); writer.close(); out.close(); return dirDst; }
From source file:contrail.stages.GraphStats.java
License:Open Source License
protected void writeN50StatsToFile(ArrayList<GraphN50StatsData> records) {
    String outputDir = (String) stage_options.get("outputpath");
    Path outputPath = new Path(outputDir, "n50stats.avro");
    FileSystem fs = null;
    try {
        fs = FileSystem.get(getConf());
    } catch (IOException e) {
        throw new RuntimeException("Can't get filesystem: " + e.getMessage());
    }

    // Write the data to the file.
    Schema schema = records.get(0).getSchema();
    DatumWriter<GraphN50StatsData> datumWriter = new SpecificDatumWriter<GraphN50StatsData>(schema);
    DataFileWriter<GraphN50StatsData> writer = new DataFileWriter<GraphN50StatsData>(datumWriter);

    try {
        FSDataOutputStream outputStream = fs.create(outputPath);
        writer.create(schema, outputStream);
        for (GraphN50StatsData stats : records) {
            writer.append(stats);
        }
        writer.close();
    } catch (IOException exception) {
        fail("There was a problem writing the N50 stats to an avro file. " + "Exception: "
                + exception.getMessage());
    }
}
From source file:contrail.stages.GraphStats.java
License:Open Source License
protected void writeTopNContigs(List<Integer> lengths) {
    String outputDir = (String) stage_options.get("outputpath");
    Path outputPath = new Path(outputDir, "topn_contigs.avro");
    FileSystem fs = null;
    try {
        fs = FileSystem.get(getConf());
    } catch (IOException e) {
        throw new RuntimeException("Can't get filesystem: " + e.getMessage());
    }

    // Write the data to the file.
    Schema schema = Schema.create(Schema.Type.INT);
    DatumWriter<Integer> datumWriter = new SpecificDatumWriter<Integer>(schema);
    DataFileWriter<Integer> writer = new DataFileWriter<Integer>(datumWriter);

    try {
        FSDataOutputStream outputStream = fs.create(outputPath);
        writer.create(schema, outputStream);
        for (Integer record : lengths) {
            writer.append(record);
        }
        writer.close();
    } catch (IOException exception) {
        fail("There was a problem writing the top N lengths to an avro file. " + "Exception: "
                + exception.getMessage());
    }
}
From source file:corner.hadoop.services.impl.HdfsAccessorProxy.java
License:Apache License
/**
 * @see corner.hadoop.services.DistributedResourceAccessor#putFile(java.lang.String,
 *      java.io.InputStream)
 */
@Override
public void putFile(String filePath, InputStream is) throws IOException {
    Path dstPath = new Path(filePath);
    FileSystem dstFs = dstPath.getFileSystem(getConf());
    FSDataOutputStream out = dstFs.create(dstPath);
    try {
        IOUtils.copyBytes(is, out, getConf(), false);
    } finally {
        out.close();
    }
}
From source file:corner.services.hadoop.impl.HdfsAccessorProxy.java
License:Apache License
/**
 * @see corner.services.hadoop.DistributedResourceAccessor#putFile(java.lang.String,
 *      java.io.InputStream)
 */
public void putFile(String filePath, InputStream is) throws IOException {
    Path dstPath = new Path(filePath);
    FileSystem dstFs = dstPath.getFileSystem(getConf());
    FSDataOutputStream out = dstFs.create(dstPath);
    try {
        IOUtils.copyBytes(is, out, getConf(), false);
    } finally {
        out.close();
    }
}
From source file:crunch.MaxTemperature.java
License:Apache License
private void writeFile(FileSystem fileSys, Path name) throws IOException {
    FSDataOutputStream stm = fileSys.create(name); // XXX FileSystem.create(Path)
    stm.close();
}