List of usage examples for org.apache.hadoop.fs FileSystem create
public FSDataOutputStream create(Path f) throws IOException
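Before the individual source files, a minimal usage sketch (illustrative only, not taken from the sources below; the output path is hypothetical). The single-argument create(Path) overwrites any existing file at the path and returns an FSDataOutputStream that must be closed by the caller.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Obtain the FileSystem backing the current Configuration (local FS or HDFS,
// depending on fs.defaultFS), create a file, write a few bytes, and close it.
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
Path outPath = new Path("/tmp/example.txt"); // hypothetical path for illustration
FSDataOutputStream out = fs.create(outPath);
try {
    out.write("hello hdfs\n".getBytes("UTF-8"));
} finally {
    out.close();
}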
From source file:com.linkedin.cubert.plan.physical.GenerateDictionary.java
License:Open Source License
public static void mergeDictionaries(Configuration conf, Path dir) throws IOException {
    Map<String, CodeDictionary> dictionaries = new HashMap<String, CodeDictionary>();
    FileSystem fs = FileSystem.get(conf);
    Path currentDictPath = new Path(dir, "dictionary");
    Schema schema = getSchema();

    // Read the existing dictionaries
    if (fs.exists(currentDictPath)) {
        dictionaries.putAll(loadDictionary(currentDictPath.toString(), true, conf));

        // move the current dictionary to new file
        Path oldPath = new Path(dir, "_dictionary.old");
        fs.delete(oldPath, true);
        fs.rename(currentDictPath, oldPath);
    }

    // Read the new entries
    Path globPath = new Path(dir, "tmp/part-*");
    FileStatus[] allStatus = fs.globStatus(globPath);
    for (FileStatus status : allStatus) {
        DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>(schema);
        DataFileReader<GenericRecord> dataFileReader =
                new DataFileReader<GenericRecord>(new FsInput(status.getPath(), conf), datumReader);

        GenericRecord record = null;
        while (dataFileReader.hasNext()) {
            record = dataFileReader.next();

            String colName = record.get("colname").toString();
            String colValue = record.get("colvalue").toString();

            CodeDictionary dict = dictionaries.get(colName);
            if (dict == null) {
                dict = new CodeDictionary();
                dictionaries.put(colName, dict);
            }

            dict.addKey(colValue);
        }
    }

    // Write the dictionaries back
    DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>(schema);
    DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(datumWriter);
    FSDataOutputStream out = fs.create(currentDictPath);

    dataFileWriter.create(schema, out);
    Record record = new Record(schema);

    for (Map.Entry<String, CodeDictionary> entry : dictionaries.entrySet()) {
        String colName = entry.getKey();
        CodeDictionary dict = entry.getValue();

        for (String colValue : dict.keySet()) {
            int code = dict.getCodeForKey(colValue);

            record.put("colname", colName);
            record.put("colvalue", colValue);
            record.put("code", code);

            dataFileWriter.append(record);
        }
    }

    dataFileWriter.close();
}
From source file:com.linkedin.cubert.utils.CodeDictionary.java
License:Open Source License
public void write(FileSystem fs, Path path) throws IOException {
    // if the path exists, rename the existing file with ".old" suffix
    if (fs.exists(path)) {
        Path renamePath = new Path(path.toString() + ".old");
        fs.delete(renamePath, false);
        fs.rename(path, renamePath);
    }

    // Write data to file
    FSDataOutputStream ostream = fs.create(path);
    BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(ostream));

    for (Map.Entry<String, Integer> entry : keyToCodeMap.entrySet()) {
        String line = String.format("%s %d\n", entry.getKey(), entry.getValue());
        writer.write(line);
    }

    writer.flush();
    writer.close();
    ostream.close();
}
From source file:com.linkedin.cubert.utils.CubertMD.java
License:Open Source License
private static void writeMetaFile(String metaFilePath, HashMap<String, String> metaFileKeyValues)
        throws IOException {
    Job tempjob = new Job();
    Configuration tempconf = tempjob.getConfiguration();
    FileSystem fs = FileSystem.get(tempconf);

    FSDataOutputStream outStream = fs.create(new Path(metaFilePath + "/.meta"));
    for (String key : metaFileKeyValues.keySet())
        outStream.write((key + " " + metaFileKeyValues.get(key) + "\n").getBytes());
    outStream.flush();
    outStream.close();
}
From source file:com.linkedin.hadoop.example.WordCountJobTest.java
License:Apache License
/**
 * Simple unit test for this job. The purpose of this test is simply to demonstrate how to write
 * mini-cluster based unit tests.
 *
 * @throws Exception If there is a problem executing the Hadoop job
 */
@Test
public void wordCountMapReduceTest() throws Exception {
    // First, write some test data to the input path
    FileSystem fileSystem = getFileSystem();
    OutputStream outputStream = fileSystem.create(new Path(_inputPath, "testFile.txt"));
    outputStream.write("Hello world test hello\nHello again world\nTesting hello world".getBytes());
    outputStream.close();

    // Now run the WordCountJob
    runJob();

    // Then check the results
    Map<String, Long> wordCounts = readWordCounts(new Path(_outputPath, "part-r-00000"));
    Assert.assertEquals(wordCounts.get("Hello"), new Long(2));
    Assert.assertEquals(wordCounts.get("world"), new Long(3));
}
From source file:com.linkedin.mapred.AvroHdfsFileWriter.java
License:Open Source License
public AvroHdfsFileWriter(JobConf conf, String path, Schema schema) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    FSDataOutputStream out = fs.create(new Path(path));
    DatumWriter<T> writer = new GenericDatumWriter<T>(schema);
    _recordWriter = new DataFileWriter<T>(writer);
    _recordWriter.create(schema, out);
}
From source file:com.linkedin.pinot.core.indexsegment.utils.InputOutputStreamUtils.java
License:Apache License
public static DataOutputStream getOutputStream(String filePath, FileSystemMode mode, FileSystem fs) {
    try {
        DataOutputStream is = null;
        switch (mode) {
        case DISK:
            File newFile = new File(filePath);
            if (!newFile.exists()) {
                if (!newFile.getParentFile().exists()) {
                    newFile.getParentFile().mkdirs();
                }
                newFile.createNewFile();
            }
            is = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(new File(filePath))));
            break;
        case HDFS:
            is = fs.create(new Path(filePath));
            break;
        default:
            throw new UnsupportedOperationException();
        }
        return is;
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    }
}
From source file:com.linkedin.pinot.hadoop.job.SegmentCreationJob.java
License:Apache License
public void run() throws Exception {
    LOGGER.info("Starting {}", getClass().getSimpleName());

    FileSystem fs = FileSystem.get(getConf());
    Path inputPathPattern = new Path(_inputSegmentDir);

    if (fs.exists(new Path(_stagingDir))) {
        LOGGER.warn("Found the temp folder, deleting it");
        fs.delete(new Path(_stagingDir), true);
    }
    fs.mkdirs(new Path(_stagingDir));
    fs.mkdirs(new Path(_stagingDir + "/input/"));

    if (fs.exists(new Path(_outputDir))) {
        LOGGER.warn("Found the output folder, deleting it");
        fs.delete(new Path(_outputDir), true);
    }
    fs.mkdirs(new Path(_outputDir));

    List<FileStatus> inputDataFiles = new ArrayList<FileStatus>();
    FileStatus[] fileStatusArr = fs.globStatus(inputPathPattern);
    for (FileStatus fileStatus : fileStatusArr) {
        inputDataFiles.addAll(getDataFilesFromPath(fs, fileStatus.getPath()));
    }

    for (int seqId = 0; seqId < inputDataFiles.size(); ++seqId) {
        FileStatus file = inputDataFiles.get(seqId);
        String completeFilePath = " " + file.getPath().toString() + " " + seqId;
        Path newOutPutFile = new Path((_stagingDir + "/input/"
                + file.getPath().toString().replace('.', '_').replace('/', '_').replace(':', '_') + ".txt"));
        FSDataOutputStream stream = fs.create(newOutPutFile);
        stream.writeUTF(completeFilePath);
        stream.flush();
        stream.close();
    }

    Job job = Job.getInstance(getConf());
    job.setJarByClass(SegmentCreationJob.class);
    job.setJobName(_jobName);

    job.setMapperClass(HadoopSegmentCreationMapper.class);

    if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
        job.getConfiguration().set("mapreduce.job.credentials.binary",
                System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
    }

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(_stagingDir + "/input/"));
    FileOutputFormat.setOutputPath(job, new Path(_stagingDir + "/output/"));

    job.getConfiguration().setInt(JobContext.NUM_MAPS, inputDataFiles.size());
    job.getConfiguration().set("data.schema", new ObjectMapper().writeValueAsString(_dataSchema));

    job.setMaxReduceAttempts(1);
    job.setMaxMapAttempts(0);
    job.setNumReduceTasks(0);

    for (Object key : _properties.keySet()) {
        job.getConfiguration().set(key.toString(), _properties.getProperty(key.toString()));
    }

    if (_depsJarPath != null && _depsJarPath.length() > 0) {
        addDepsJarToDistributedCache(new Path(_depsJarPath), job);
    }

    // Submit the job for execution.
    job.waitForCompletion(true);
    if (!job.isSuccessful()) {
        throw new RuntimeException("Job failed : " + job);
    }

    LOGGER.info("Moving Segment Tar files from {} to: {}", _stagingDir + "/output/segmentTar", _outputDir);
    FileStatus[] segmentArr = fs.listStatus(new Path(_stagingDir + "/output/segmentTar"));
    for (FileStatus segment : segmentArr) {
        fs.rename(segment.getPath(), new Path(_outputDir, segment.getPath().getName()));
    }

    // Delete temporary directory.
    LOGGER.info("Cleanup the working directory.");
    LOGGER.info("Deleting the dir: {}", _stagingDir);
    fs.delete(new Path(_stagingDir), true);
}
From source file:com.linkedin.thirdeye.bootstrap.segment.create.SegmentCreationPhaseJob.java
License:Apache License
public Job run() throws Exception {

    Job job = Job.getInstance(getConf());
    job.setJarByClass(SegmentCreationPhaseJob.class);
    job.setJobName(name);

    FileSystem fs = FileSystem.get(getConf());
    Configuration configuration = job.getConfiguration();

    String schemaPath = getAndSetConfiguration(configuration, SEGMENT_CREATION_SCHEMA_PATH);
    LOGGER.info("Schema path : {}", schemaPath);
    String configPath = getAndSetConfiguration(configuration, SEGMENT_CREATION_CONFIG_PATH);
    LOGGER.info("Config path : {}", configPath);
    Schema dataSchema = createSchema(configPath);
    LOGGER.info("Data schema : {}", dataSchema);
    String inputSegmentDir = getAndSetConfiguration(configuration, SEGMENT_CREATION_INPUT_PATH);
    LOGGER.info("Input path : {}", inputSegmentDir);
    String outputDir = getAndSetConfiguration(configuration, SEGMENT_CREATION_OUTPUT_PATH);
    LOGGER.info("Output path : {}", outputDir);
    String stagingDir = new File(outputDir, TEMP).getAbsolutePath();
    LOGGER.info("Staging dir : {}", stagingDir);
    String tableName = getAndSetConfiguration(configuration, SEGMENT_CREATION_SEGMENT_TABLE_NAME);
    LOGGER.info("Segment table name : {}", tableName);

    // Create temporary directory
    if (fs.exists(new Path(stagingDir))) {
        LOGGER.warn("Found the temp folder, deleting it");
        fs.delete(new Path(stagingDir), true);
    }
    fs.mkdirs(new Path(stagingDir));
    fs.mkdirs(new Path(stagingDir + "/input/"));

    if (fs.exists(new Path(outputDir))) {
        LOGGER.warn("Found the output folder deleting it");
        fs.delete(new Path(outputDir), true);
    }
    fs.mkdirs(new Path(outputDir));

    Path inputPathPattern = new Path(inputSegmentDir);
    List<FileStatus> inputDataFiles = Arrays.asList(fs.listStatus(inputPathPattern));
    LOGGER.info("size {}", inputDataFiles.size());

    try {
        for (int seqId = 0; seqId < inputDataFiles.size(); ++seqId) {
            FileStatus file = inputDataFiles.get(seqId);
            String completeFilePath = " " + file.getPath().toString() + " " + seqId;
            Path newOutPutFile = new Path((stagingDir + "/input/"
                    + file.getPath().toString().replace('.', '_').replace('/', '_').replace(':', '_') + ".txt"));
            FSDataOutputStream stream = fs.create(newOutPutFile);
            LOGGER.info("wrote {}", completeFilePath);
            stream.writeUTF(completeFilePath);
            stream.flush();
            stream.close();
        }
    } catch (Exception e) {
        LOGGER.error("Exception while reading input files ", e);
    }

    job.setMapperClass(SegmentCreationPhaseMapReduceJob.SegmentCreationMapper.class);

    if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
        job.getConfiguration().set("mapreduce.job.credentials.binary",
                System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
    }

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(stagingDir + "/input/"));
    FileOutputFormat.setOutputPath(job, new Path(stagingDir + "/output/"));

    job.getConfiguration().setInt(JobContext.NUM_MAPS, inputDataFiles.size());
    job.getConfiguration().set("data.schema", OBJECT_MAPPER.writeValueAsString(dataSchema));

    if (!fs.exists(new Path(schemaPath))) {
        OBJECT_MAPPER.writerWithDefaultPrettyPrinter().writeValue(fs.create(new Path(schemaPath), false),
                dataSchema);
    }

    job.setMaxReduceAttempts(1);
    job.setMaxMapAttempts(0);
    job.setNumReduceTasks(0);

    for (Object key : props.keySet()) {
        job.getConfiguration().set(key.toString(), props.getProperty(key.toString()));
    }

    job.waitForCompletion(true);
    if (!job.isSuccessful()) {
        throw new RuntimeException("Job failed : " + job);
    }

    LOGGER.info("Moving Segment Tar files from {} to: {}", stagingDir + "/output/segmentTar", outputDir);
    FileStatus[] segmentArr = fs.listStatus(new Path(stagingDir + "/output/segmentTar"));
    for (FileStatus segment : segmentArr) {
        fs.rename(segment.getPath(), new Path(outputDir, segment.getPath().getName()));
    }

    // Delete temporary directory.
    LOGGER.info("Cleanup the working directory.");
    LOGGER.info("Deleting the dir: {}", stagingDir);
    fs.delete(new Path(stagingDir), true);

    return job;
}
From source file:com.linkedin.thirdeye.hadoop.backfill.BackfillPhaseJob.java
License:Apache License
public Job run() throws Exception {

    Job job = Job.getInstance(getConf());
    job.setJarByClass(BackfillPhaseJob.class);
    job.setJobName(name);

    FileSystem fs = FileSystem.get(getConf());
    Configuration configuration = job.getConfiguration();

    LOGGER.info("*******************************************************************************");
    String controllerHost = getAndSetConfiguration(configuration, BACKFILL_PHASE_CONTROLLER_HOST);
    String controllerPort = getAndSetConfiguration(configuration, BACKFILL_PHASE_CONTROLLER_PORT);
    LOGGER.info("Controller Host : {} Controller Port : {}", controllerHost, controllerPort);
    String segmentStartTime = getAndSetConfiguration(configuration, BACKFILL_PHASE_START_TIME);
    String segmentEndTime = getAndSetConfiguration(configuration, BACKFILL_PHASE_END_TIME);
    long startTime = Long.valueOf(segmentStartTime);
    long endTime = Long.valueOf(segmentEndTime);
    if (Long.valueOf(segmentStartTime) > Long.valueOf(segmentEndTime)) {
        throw new IllegalStateException("Start time cannot be greater than end time");
    }
    String tableName = getAndSetConfiguration(configuration, BACKFILL_PHASE_TABLE_NAME);
    LOGGER.info("Start time : {} End time : {} Table name : {}", segmentStartTime, segmentEndTime, tableName);

    String outputPath = getAndSetConfiguration(configuration, BACKFILL_PHASE_OUTPUT_PATH);
    LOGGER.info("Output path : {}", outputPath);
    Path backfillDir = new Path(outputPath);
    if (fs.exists(backfillDir)) {
        LOGGER.warn("Found the output folder deleting it");
        fs.delete(backfillDir, true);
    }
    Path downloadDir = new Path(backfillDir, DOWNLOAD);
    LOGGER.info("Creating download dir : {}", downloadDir);
    fs.mkdirs(downloadDir);
    Path inputDir = new Path(backfillDir, INPUT);
    LOGGER.info("Creating input dir : {}", inputDir);
    fs.mkdirs(inputDir);
    Path outputDir = new Path(backfillDir, OUTPUT);
    LOGGER.info("Creating output dir : {}", outputDir);

    BackfillControllerAPIs backfillControllerAPIs = new BackfillControllerAPIs(controllerHost,
            Integer.valueOf(controllerPort), tableName);

    LOGGER.info("Downloading segments in range {} to {}", startTime, endTime);
    List<String> allSegments = backfillControllerAPIs.getAllSegments(tableName);
    List<String> segmentsToDownload = backfillControllerAPIs.findSegmentsInRange(tableName, allSegments,
            startTime, endTime);
    for (String segmentName : segmentsToDownload) {
        backfillControllerAPIs.downloadSegment(segmentName, downloadDir);
    }

    LOGGER.info("Reading downloaded segment input files");
    List<FileStatus> inputDataFiles = new ArrayList<>();
    inputDataFiles.addAll(Lists.newArrayList(fs.listStatus(downloadDir)));
    LOGGER.info("size {}", inputDataFiles.size());

    try {
        LOGGER.info("Creating input files at {} for segment input files", inputDir);
        for (int seqId = 0; seqId < inputDataFiles.size(); ++seqId) {
            FileStatus file = inputDataFiles.get(seqId);
            String completeFilePath = " " + file.getPath().toString() + " " + seqId;
            Path newOutPutFile = new Path((inputDir + "/"
                    + file.getPath().toString().replace('.', '_').replace('/', '_').replace(':', '_') + ".txt"));
            FSDataOutputStream stream = fs.create(newOutPutFile);
            LOGGER.info("wrote {}", completeFilePath);
            stream.writeUTF(completeFilePath);
            stream.flush();
            stream.close();
        }
    } catch (Exception e) {
        LOGGER.error("Exception while reading input files ", e);
    }

    job.setMapperClass(BackfillPhaseMapJob.BackfillMapper.class);

    if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
        job.getConfiguration().set("mapreduce.job.credentials.binary",
                System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
    }

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, inputDir);
    FileOutputFormat.setOutputPath(job, outputDir);

    job.getConfiguration().setInt(JobContext.NUM_MAPS, inputDataFiles.size());

    job.setMaxReduceAttempts(1);
    job.setMaxMapAttempts(0);
    job.setNumReduceTasks(0);

    for (Object key : props.keySet()) {
        job.getConfiguration().set(key.toString(), props.getProperty(key.toString()));
    }

    job.waitForCompletion(true);
    if (!job.isSuccessful()) {
        throw new RuntimeException("Job failed : " + job);
    }

    LOGGER.info("Cleanup the working directory");
    LOGGER.info("Deleting the dir: {}", downloadDir);
    fs.delete(downloadDir, true);
    LOGGER.info("Deleting the dir: {}", inputDir);
    fs.delete(inputDir, true);
    LOGGER.info("Deleting the dir: {}", outputDir);
    fs.delete(outputDir, true);

    return job;
}
From source file:com.linkedin.thirdeye.hadoop.segment.creation.SegmentCreationPhaseJob.java
License:Apache License
public Job run() throws Exception {

    Job job = Job.getInstance(getConf());
    job.setJarByClass(SegmentCreationPhaseJob.class);
    job.setJobName(name);

    FileSystem fs = FileSystem.get(getConf());
    Configuration configuration = job.getConfiguration();

    String inputSegmentDir = getAndSetConfiguration(configuration, SEGMENT_CREATION_INPUT_PATH);
    LOGGER.info("Input path : {}", inputSegmentDir);
    Schema avroSchema = ThirdeyeAvroUtils.getSchema(inputSegmentDir);
    LOGGER.info("Schema : {}", avroSchema);
    String metricTypesProperty = ThirdeyeAvroUtils.getMetricTypesProperty(
            props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString()),
            props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString()), avroSchema);
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), metricTypesProperty);
    ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props);
    LOGGER.info("ThirdEyeConfig {}", thirdeyeConfig.encode());
    String outputDir = getAndSetConfiguration(configuration, SEGMENT_CREATION_OUTPUT_PATH);
    LOGGER.info("Output path : {}", outputDir);
    Path stagingDir = new Path(outputDir, TEMP);
    LOGGER.info("Staging dir : {}", stagingDir);
    String segmentWallClockStart = getAndSetConfiguration(configuration, SEGMENT_CREATION_WALLCLOCK_START_TIME);
    LOGGER.info("Segment wallclock start time : {}", segmentWallClockStart);
    String segmentWallClockEnd = getAndSetConfiguration(configuration, SEGMENT_CREATION_WALLCLOCK_END_TIME);
    LOGGER.info("Segment wallclock end time : {}", segmentWallClockEnd);
    String schedule = getAndSetConfiguration(configuration, SEGMENT_CREATION_SCHEDULE);
    LOGGER.info("Segment schedule : {}", schedule);
    String isBackfill = props.getProperty(SEGMENT_CREATION_BACKFILL.toString(), DEFAULT_BACKFILL);
    configuration.set(SEGMENT_CREATION_BACKFILL.toString(), isBackfill);
    LOGGER.info("Is Backfill : {}", configuration.get(SEGMENT_CREATION_BACKFILL.toString()));

    // Create temporary directory
    if (fs.exists(stagingDir)) {
        LOGGER.warn("Found the temp folder, deleting it");
        fs.delete(stagingDir, true);
    }
    fs.mkdirs(stagingDir);
    fs.mkdirs(new Path(stagingDir + "/input/"));

    // Create output directory
    if (fs.exists(new Path(outputDir))) {
        LOGGER.warn("Found the output folder deleting it");
        fs.delete(new Path(outputDir), true);
    }
    fs.mkdirs(new Path(outputDir));

    // Read input files
    List<FileStatus> inputDataFiles = new ArrayList<>();
    for (String input : inputSegmentDir.split(",")) {
        Path inputPathPattern = new Path(input);
        inputDataFiles.addAll(Arrays.asList(fs.listStatus(inputPathPattern)));
    }
    LOGGER.info("size {}", inputDataFiles.size());

    try {
        for (int seqId = 0; seqId < inputDataFiles.size(); ++seqId) {
            FileStatus file = inputDataFiles.get(seqId);
            String completeFilePath = " " + file.getPath().toString() + " " + seqId;
            Path newOutPutFile = new Path((stagingDir + "/input/"
                    + file.getPath().toString().replace('.', '_').replace('/', '_').replace(':', '_') + ".txt"));
            FSDataOutputStream stream = fs.create(newOutPutFile);
            LOGGER.info("wrote {}", completeFilePath);
            stream.writeUTF(completeFilePath);
            stream.flush();
            stream.close();
        }
    } catch (Exception e) {
        LOGGER.error("Exception while reading input files ", e);
    }

    job.setMapperClass(SegmentCreationPhaseMapReduceJob.SegmentCreationMapper.class);

    if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
        job.getConfiguration().set("mapreduce.job.credentials.binary",
                System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
    }

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(stagingDir + "/input/"));
    FileOutputFormat.setOutputPath(job, new Path(stagingDir + "/output/"));

    job.getConfiguration().setInt(JobContext.NUM_MAPS, inputDataFiles.size());
    job.getConfiguration().set(SEGMENT_CREATION_THIRDEYE_CONFIG.toString(),
            OBJECT_MAPPER.writeValueAsString(thirdeyeConfig));

    job.setMaxReduceAttempts(1);
    job.setMaxMapAttempts(0);
    job.setNumReduceTasks(0);

    for (Object key : props.keySet()) {
        job.getConfiguration().set(key.toString(), props.getProperty(key.toString()));
    }

    job.waitForCompletion(true);
    if (!job.isSuccessful()) {
        throw new RuntimeException("Job failed : " + job);
    }

    LOGGER.info("Moving Segment Tar files from {} to: {}", stagingDir + "/output/segmentTar", outputDir);
    FileStatus[] segmentArr = fs.listStatus(new Path(stagingDir + "/output/segmentTar"));
    for (FileStatus segment : segmentArr) {
        fs.rename(segment.getPath(), new Path(outputDir, segment.getPath().getName()));
    }

    // Delete temporary directory.
    LOGGER.info("Cleanup the working directory.");
    LOGGER.info("Deleting the dir: {}", stagingDir);
    fs.delete(stagingDir, true);

    return job;
}