org.commoncrawl.util.HDFSUtils.java Source code

Java tutorial

Introduction

Here is the source code for org.commoncrawl.util.HDFSUtils.java.

Source

/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.google.common.collect.ImmutableList;

/**
 * Static helpers for reading and writing line-oriented UTF-8 text files
 * through a Hadoop {@link FileSystem}, and for picking the largest
 * numerically-named entry directly beneath a directory.
 *
 * @author rana
 *
 */
public class HDFSUtils {

    private static final Log LOG = LogFactory.getLog(HDFSUtils.class);

    /** Charset used for all text I/O in this class; resolved once rather than on every call. */
    private static final Charset UTF8 = Charset.forName("UTF-8");

    /**
     * Reads a UTF-8 text file and returns its significant lines in file order.
     * <p>
     * Lines of length zero and lines whose first character is {@code '#'} are
     * skipped. Lines are not trimmed, so a whitespace-only line, or a line with
     * whitespace before the {@code '#'}, is kept as-is.
     *
     * @param fs   file system used to open {@code path}
     * @param path location of the text file to read
     * @return an immutable list of the retained lines
     * @throws IOException if the file cannot be opened or read
     */
    public static List<String> textFileToList(FileSystem fs, Path path) throws IOException {

        ImmutableList.Builder<String> builder = new ImmutableList.Builder<String>();

        BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(path), UTF8));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.length() != 0 && !line.startsWith("#")) {
                    builder.add(line);
                }
            }
        } finally {
            // Closing the outermost reader also closes the underlying stream.
            reader.close();
        }
        return builder.build();
    }

    /**
     * Writes each element of {@code lines} to {@code path} as UTF-8 text,
     * terminating every element (including the last) with a single
     * {@code '\n'}.
     *
     * @param lines the lines to write, in order
     * @param fs    file system used to create {@code path}
     * @param path  location of the file to create
     * @throws IOException if the file cannot be created or written
     */
    public static void listToTextFile(List<String> lines, FileSystem fs, Path path) throws IOException {
        Writer writer = new OutputStreamWriter(fs.create(path), UTF8);
        try {
            for (String line : lines) {
                writer.write(line);
                writer.append("\n");
            }
            writer.flush();
        } finally {
            writer.close();
        }
    }

    /**
     * Scans the immediate children of {@code rootPath} and returns the largest
     * value among those whose name parses as a {@code long}.
     * <p>
     * Children whose names are not valid {@code long} values are logged and
     * ignored.
     *
     * @param fs       file system to query
     * @param rootPath directory whose children are examined
     * @return the largest parsed child name, or {@code -1} if no child name
     *         could be parsed (or the glob yielded no result)
     * @throws IOException if the file system listing fails
     */
    public static long findLatestDatabaseTimestamp(FileSystem fs, Path rootPath) throws IOException {

        FileStatus[] candidates = fs.globStatus(new Path(rootPath, "*"));

        // -1 doubles as the "nothing found yet" sentinel and the not-found return value.
        long candidateTimestamp = -1L;

        // globStatus is documented to return null in some situations; treat
        // that the same as "no candidates" instead of failing with an NPE.
        if (candidates != null) {
            for (FileStatus candidate : candidates) {
                LOG.info("Found Seed Candidate:" + candidate.getPath());
                try {
                    long timestamp = Long.parseLong(candidate.getPath().getName());
                    if (candidateTimestamp == -1 || candidateTimestamp < timestamp) {
                        candidateTimestamp = timestamp;
                    }
                } catch (NumberFormatException e) {
                    // Only a non-numeric name is an expected, skippable condition here;
                    // any other runtime failure should propagate rather than be hidden.
                    LOG.error("Invalid Path:" + candidate.getPath());
                }
            }
        }
        LOG.info("Selected Candidate is:" + candidateTimestamp);
        return candidateTimestamp;
    }
}