org.commoncrawl.service.listcrawler.ProxyPurgeUtils.java Source code

Java tutorial

Introduction

Here is the source code for org.commoncrawl.service.listcrawler.ProxyPurgeUtils.java

Source

/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.service.listcrawler;

import java.io.IOException;
import java.util.Date;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.commoncrawl.util.time.Month;
import org.commoncrawl.util.time.SerialDate;
import org.commoncrawl.util.Tuples.Pair;

import com.google.common.collect.Multimap;
import com.google.common.collect.TreeMultimap;

/** 
 * Utility code 
 * 
 * @author rana
 *
 */
public class ProxyPurgeUtils {

    public static void main(String[] args) throws IOException {

        Configuration conf = new Configuration();

        conf.addResource("nutch-default.xml");
        conf.addResource("nutch-site.xml");

        if (args[0].equalsIgnoreCase("list")) {
            listCandidates(conf, caculateCutOffMillisecond());
        }
    }

    static long caculateCutOffMillisecond() {
        // convert it to a serial date 
        SerialDate today = SerialDate.createInstance(new Date(System.currentTimeMillis()));
        // get date 4 months back 
        SerialDate cutOffDate = SerialDate.addDays(-120, today);
        // convert to month 
        Month cutOffMonth = new Month(cutOffDate.toDate());
        // get first cut off millisecond
        long cutOffMillisecond = cutOffMonth.getFirstMillisecond();

        return cutOffMillisecond;
    }

    static class Range extends Pair<Long, Long> implements Comparable<Pair<Long, Long>> {

        public Range(Long e0, Long e1) {
            super(e0, e1);
        }

        @Override
        public int compareTo(Pair<Long, Long> o2) {
            return (e0 < o2.e0) ? -1 : (e0 > o2.e0) ? 1 : 0;
        }

    }

    //1312941988792
    static void listCandidates(Configuration conf, final long cutOffTimeMillisecond) throws IOException {
        FileSystem fs = FileSystem.get(conf);
        FileSystem localFS = FileSystem.getLocal(conf);

        final Multimap<Long, Range> rangeMap = TreeMultimap.create();
        FileStatus candidateDirs[] = fs.globStatus(new Path("crawl/proxy/cacheExport/processed/*"));

        for (FileStatus candidate : candidateDirs) {
            String fileName = candidate.getPath().getName();
            // get scaled timestamp start 
            long timestampStart = Long.parseLong(fileName) * 1000000000;
            // ok see if exceeds our cutoff time 
            if (timestampStart < cutOffTimeMillisecond) {
                FileStatus ranges[] = fs.globStatus(new Path(candidate.getPath(), "*"));
                for (FileStatus range : ranges) {
                    String rangeName = range.getPath().getName();
                    long rangeStart = Long.parseLong(rangeName.substring(0, rangeName.indexOf("-")));
                    long rangeEnd = Long.parseLong(rangeName.substring(rangeName.indexOf("-") + 1));

                    rangeMap.put(Long.parseLong(fileName), new Range(rangeStart, rangeEnd));
                }
            }
        }

        PathFilter cacheDataFilter = new PathFilter() {

            @Override
            public boolean accept(Path path) {
                if (path.getName().startsWith("cacheData-") || path.getName().startsWith("cacheIndex-")) {
                    long timestamp = Long.parseLong(path.getName().substring(path.getName().indexOf("-") + 1));
                    long timestampPrefix = timestamp / 1000000000L;
                    //System.out.println("timestamp:" + timestamp + " prefix:" + timestampPrefix);
                    for (Range range : rangeMap.get(timestampPrefix)) {
                        if (timestamp >= range.e0 && timestamp <= range.e1) {
                            return true;
                        }
                    }
                }
                return false;
            }
        };

        PathFilter historyDataFilter = new PathFilter() {

            @Override
            public boolean accept(Path path) {
                if (path.getName().startsWith("historyData-") || path.getName().startsWith("historyBloomFilter-")) {
                    int indexOfDot = path.getName().indexOf(".");
                    long timestamp = -1L;
                    if (indexOfDot != -1) {
                        timestamp = Long
                                .parseLong(path.getName().substring(path.getName().indexOf("-") + 1, indexOfDot));
                    } else {
                        timestamp = Long.parseLong(path.getName().substring(path.getName().indexOf("-") + 1));
                    }

                    if (timestamp < cutOffTimeMillisecond) {
                        return true;
                    }
                }
                return false;
            }
        };

        FileStatus purgeCandidates[] = fs.globStatus(new Path("crawl/proxy/cache/*"), cacheDataFilter);

        for (FileStatus candidate : purgeCandidates) {
            System.out.println("Purging Candidate:" + candidate.getPath());
            fs.delete(candidate.getPath(), false);
        }

        FileStatus localcacheDataPurgeCandidates[] = localFS
                .globStatus(new Path("/home/rana/ccprod/data/proxy_data/ccn01-Prod/*"), cacheDataFilter);

        for (FileStatus candidate : localcacheDataPurgeCandidates) {
            System.out.println("Purging Candidate:" + candidate.getPath());
            localFS.delete(candidate.getPath(), false);
        }

        // now delete bloom filter data
        FileStatus historyPurgeCandidates[] = fs.globStatus(new Path("crawl/proxy/history/*"), historyDataFilter);

        for (FileStatus candidate : historyPurgeCandidates) {
            System.out.println("Purging Candidate:" + candidate.getPath());
            fs.delete(candidate.getPath(), true);
        }

        // now delete bloom filter data
        FileStatus localHistoryPurgeCandidates[] = localFS.globStatus(
                new Path("/home/rana/ccprod/data/proxy_data/ccn01-Prod/historyData/*"), historyDataFilter);

        for (FileStatus candidate : historyPurgeCandidates) {
            System.out.println("Purging Candidate:" + candidate.getPath());
            fs.delete(candidate.getPath(), true);
        }

        for (FileStatus candidate : localHistoryPurgeCandidates) {
            System.out.println("Purging Candidate:" + candidate.getPath());
            localFS.delete(candidate.getPath(), true);
        }

    }
}