com.chimpler.example.bayes.DataToSeq.java Source code

Java tutorial

Introduction

Here is the source code for com.chimpler.example.bayes.DataToSeq.java

Source

/*
 * Copyright (c) 2010 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.chimpler.example.bayes;

import java.io.BufferedReader;
import java.io.FileReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.Writer;
import org.apache.hadoop.io.Text;

/**
 * http://www.chimpler.com
 */
public class DataToSeq {
    public static void main(String args[]) throws Exception {
        if (args.length != 2) {
            System.err.println("Arguments: [input tsv file] [output sequence file]");
            return;
        }
        String inputFileName = args[0];
        String outputDirName = args[1];
        Configuration configuration = new Configuration();
        FileSystem fs = FileSystem.get(configuration);
        Writer writer = new SequenceFile.Writer(fs, configuration, new Path(outputDirName + "/chunk-0"), Text.class,
                Text.class);

        int count = 0;
        BufferedReader reader = new BufferedReader(new FileReader(inputFileName));
        Text key = new Text();
        Text value = new Text();
        while (true) {
            String line = reader.readLine();
            if (line == null) {
                break;
            }
            String[] tokens = line.split("\t", 2);
            if (tokens.length != 2) {
                System.out.println("Skip line: " + line);
                continue;
            }
            String category = tokens[0];
            //   String id = tokens[1];
            String message = tokens[1];
            key.set("/" + category + "/");
            value.set(message);
            writer.append(key, value);
            count++;
        }
        reader.close();
        writer.close();
        System.out.println("Wrote " + count + " entries.");
    }
}