org.apache.nutch.mapreduce.NutchCounter.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.nutch.mapreduce.NutchCounter.java

Source

/*******************************************************************************
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package org.apache.nutch.mapreduce;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.collections4.MapUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.EnumUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.Validate;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.TaskInputOutputContext;
import org.apache.nutch.crawl.filters.CrawlFilter.PageType;
import org.apache.nutch.crawl.filters.CrawlFilters;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.util.NetUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

/**
 * Keeps a set of named integer counters for a single map/reduce task.
 *
 * <p>Every registered counter has two values: a "native" value that only lives
 * in this object (read through the {@code get} methods and the status string),
 * and a "global" delta that {@link #accumulateGlobalCounters()} flushes into
 * the task context's counter of the same name and then resets to zero.
 *
 * <p>Counters are addressed by index. Caller-supplied counters occupy indexes
 * {@code 0..n-1} in the order they were handed to {@code register}; the
 * built-in {@link Counter} values are always appended after them.
 *
 * <p>Registration is expected to happen once, before any counting starts. The
 * name-to-index map is a plain {@code HashMap} and registration itself is not
 * synchronized.
 */
public class NutchCounter {

    /**
     * Logger named after this instance and the trimmed job name. Assigned in
     * the constructor; left as a public non-final field so existing callers
     * keep compiling.
     */
    public Logger LOG;

    /** Built-in counters, always registered after the caller-supplied ones. */
    public static enum Counter {
        totalPages, indexPages, detailPages
    }

    /** Source of per-JVM unique instance ids. */
    private static final AtomicInteger counterSequence = new AtomicInteger(0);

    @SuppressWarnings("rawtypes")
    protected final TaskInputOutputContext context;

    protected final Configuration conf;

    private final int id;

    private final String name;

    private final CrawlFilters crawlFilters;

    private final String hostname;

    private final String counterGroup = Nutch.COUNTER_GROUP_STATUS;

    // Number of registered counters; zero until registration has completed.
    private final AtomicInteger countersCount = new AtomicInteger(0);
    // Counter names, by index. Only appended to during registration.
    private final ArrayList<String> counterNames = Lists.newArrayList();
    // Deltas not yet flushed to the task context, by index.
    private final ArrayList<AtomicInteger> globalCounters = Lists.newArrayList();
    // Values local to this object, by index.
    private final ArrayList<AtomicInteger> nativeCounters = Lists.newArrayList();
    // Name -> index lookup. Not thread safe; only written during registration.
    private final Map<String, Integer> counterIndexes = Maps.newHashMap();

    /**
     * Creates a counter bound to the given task context.
     *
     * @param context task context whose configuration, job name and counters
     *                are used by this object
     */
    @SuppressWarnings("rawtypes")
    public NutchCounter(TaskInputOutputContext context) {
        this.context = context;
        this.conf = context.getConfiguration();
        this.id = counterSequence.incrementAndGet();
        this.name = "NutchCounter" + "-" + id;

        // Trim the job name for use in the logger name: drop everything from
        // the last '-' onwards and remove any "[...]" section.
        String jobName = context.getJobName();
        jobName = StringUtils.substringBeforeLast(jobName, "-");
        jobName = jobName.replaceAll("(\\[.+\\])", "");
        this.LOG = LoggerFactory.getLogger(name + "-" + jobName);

        this.hostname = NetUtil.getHostname();

        this.crawlFilters = CrawlFilters.create(conf);
    }

    /**
     * @return the number of {@code NutchCounter} instances created so far in
     *         this JVM
     */
    public static int counterSequence() {
        return counterSequence.get();
    }

    /**
     * Registers every constant of {@code counterClass} as a counter, followed
     * by the built-in {@link Counter} values. Has no effect (apart from a
     * warning) if counters are already registered.
     *
     * <p>The enum-based methods address these counters by
     * {@link Enum#ordinal()}, which relies on {@code EnumUtils.getEnumMap}
     * returning the constants in declaration order.
     *
     * @param counterClass enum type whose constant names become counter names
     * @param <T>          the enum type
     */
    public <T extends Enum<T>> void register(Class<T> counterClass) {
        register(EnumUtils.getEnumMap(counterClass).keySet());
    }

    /** @return the unique id of this instance */
    public final int id() {
        return id;
    }

    /** @return the name of this instance, {@code "NutchCounter-" + id} */
    public final String name() {
        return name;
    }

    /** @return the host name obtained from {@code NetUtil} at construction */
    public final String getHostname() {
        return hostname;
    }

    /** @return the task context this counter was created with */
    @SuppressWarnings("rawtypes")
    public final TaskInputOutputContext getContext() {
        return context;
    }

    /**
     * Adds one to the given counter.
     *
     * @param counter a constant of the registered enum, or a built-in
     *                {@link Counter}
     */
    public void increase(Enum<?> counter) {
        increase(indexOf(counter));
    }

    /**
     * Adds {@code value} to the given counter.
     *
     * @param counter a constant of the registered enum, or a built-in
     *                {@link Counter}
     * @param value   amount to add
     */
    public void increase(Enum<?> counter, int value) {
        increase(indexOf(counter), value);
    }

    /**
     * Sets both the native value and the pending global delta of the given
     * counter to {@code value}.
     *
     * @param counter a constant of the registered enum, or a built-in
     *                {@link Counter}
     * @param value   new value
     */
    public void setValue(Enum<?> counter, int value) {
        setValue(indexOf(counter), value);
    }

    /**
     * Maps a built-in counter to its index. Built-in counters occupy the last
     * {@code Counter.values().length} slots.
     *
     * @param counter the built-in counter
     * @return its index; negative if nothing has been registered yet
     */
    public int getIndex(Counter counter) {
        return countersCount.get() - Counter.values().length + counter.ordinal();
    }

    /**
     * Counts one page for {@code url}: always bumps {@code totalPages}, and
     * additionally {@code detailPages} or {@code indexPages} depending on the
     * page type reported by the crawl filters.
     *
     * @param url the page url to classify
     * @throws IOException declared for callers; not thrown by the code visible
     *                     in this method
     */
    public void updateAffectedRows(String url) throws IOException {
        PageType pageType = crawlFilters.getPageType(url);

        // Identity comparison is the enum idiom and also tolerates a null type.
        if (pageType == PageType.DETAIL) {
            increase(getIndex(Counter.detailPages));
        } else if (pageType == PageType.INDEX) {
            increase(getIndex(Counter.indexPages));
        }

        increase(getIndex(Counter.totalPages));
    }

    /**
     * Reads the native value of the counter with the given name.
     *
     * @param name counter name
     * @return the value, or {@code -1} if no such counter is registered
     */
    public int get(String name) {
        Integer index = counterIndexes.get(name);
        if (index == null) {
            // Previously this unboxed null and threw NullPointerException.
            LOG.error("Unknown counter {}", name);
            return -1;
        }

        return get(index.intValue());
    }

    /**
     * Reads the native value of the counter at {@code index}.
     *
     * @param index counter index
     * @return the value, or {@code -1} if the index is out of range
     */
    public int get(int index) {
        if (!validate(index)) {
            return -1;
        }

        return nativeCounters.get(index).get();
    }

    /**
     * Reads the native value of the given counter.
     *
     * @param counter a constant of the registered enum, or a built-in
     *                {@link Counter}
     * @return the value, or {@code -1} if the counter is not registered
     */
    public int get(Enum<?> counter) {
        return get(indexOf(counter));
    }

    /**
     * Builds a tab-separated {@code "name : value"} list of non-zero native
     * counters.
     *
     * @param names counters to include; null or empty means all of them
     * @return the status string, empty when every selected counter is zero
     */
    public String getStatusString(String... names) {
        StringBuilder sb = new StringBuilder();
        boolean includeAll = ArrayUtils.isEmpty(names);

        int count = countersCount.get();
        for (int i = 0; i < count; ++i) {
            String counterName = counterNames.get(i);
            if (!includeAll && !ArrayUtils.contains(names, counterName)) {
                continue;
            }

            int value = nativeCounters.get(i).get();
            if (value != 0) {
                sb.append(counterName).append(" : ").append(value).append('\t');
            }
        }

        return sb.toString();
    }

    /** @return the status string covering all registered counters */
    public String getStatusString() {
        return getStatusString(ArrayUtils.EMPTY_STRING_ARRAY);
    }

    /** Logs the current status string, or "nothing counted" when it is empty. */
    public void logStatus() {
        String status = getStatusString();

        if (!status.isEmpty()) {
            // Log the string that was checked instead of rebuilding it.
            LOG.info(status);
        } else {
            LOG.info("nothing counted");
        }
    }

    /**
     * Flushes every pending global delta into the task context's counter of
     * the same name (in this object's counter group) and resets the delta.
     */
    public void accumulateGlobalCounters() {
        int count = countersCount.get();
        for (int i = 0; i < count; ++i) {
            // getAndSet(0) takes the delta and clears it in one atomic step.
            int value = globalCounters.get(i).getAndSet(0);

            if (value != 0) {
                context.getCounter(counterGroup, counterNames.get(i)).increment(value);
            }
        }
    }

    /**
     * Adds one to the counter at {@code index}; ignored (with an error log)
     * if the index is out of range.
     */
    protected void increase(int index) {
        if (!validate(index)) {
            return;
        }

        globalCounters.get(index).incrementAndGet();
        nativeCounters.get(index).incrementAndGet();
    }

    /**
     * Adds {@code value} to the counter at {@code index}; ignored (with an
     * error log) if the index is out of range.
     */
    protected void increase(int index, int value) {
        if (!validate(index)) {
            return;
        }

        globalCounters.get(index).addAndGet(value);
        nativeCounters.get(index).addAndGet(value);
    }

    /** Adds one to each of the counters at the given indexes. */
    protected void increaseAll(int... indexes) {
        for (int index : indexes) {
            increase(index);
        }
    }

    /**
     * Sets both values of the counter at {@code index} to {@code value};
     * ignored (with an error log) if the index is out of range.
     */
    protected void setValue(int index, int value) {
        if (!validate(index)) {
            return;
        }

        globalCounters.get(index).set(value);
        nativeCounters.get(index).set(value);
    }

    /**
     * Registers the given names, followed by the built-in {@link Counter}
     * names. Only the first registration takes effect; later calls log a
     * warning and return.
     *
     * @param counters caller-supplied counter names, in index order
     */
    protected void register(Collection<String> counters) {
        if (countersCount.get() != 0) {
            LOG.warn("already registered");
            return;
        }

        ArrayList<String> newCounters = Lists.newArrayList();
        newCounters.addAll(counters);
        // Built-in counters go last; getIndex(Counter) depends on this layout.
        newCounters.addAll(EnumUtils.getEnumMap(Counter.class).keySet());

        registerCounters(newCounters);
    }

    /** Creates the storage for {@code names} and checks the result for consistency. */
    private void registerCounters(ArrayList<String> names) {
        int size = names.size();

        for (int i = 0; i < size; ++i) {
            String counterName = names.get(i);
            counterNames.add(counterName);
            counterIndexes.put(counterName, i);
            globalCounters.add(new AtomicInteger(0));
            nativeCounters.add(new AtomicInteger(0));
        }

        // Publish the count only after the lists are populated, so a reader
        // that observes the new count never indexes past the end of a list.
        countersCount.set(size);

        if (size == 0) {
            LOG.warn("No counters, will not run report thread");
        }

        // A duplicate name collapses in the map and makes these checks fail.
        Validate.isTrue(counterIndexes.size() == counterNames.size());
        Validate.isTrue(counterIndexes.size() == countersCount.get());
        Validate.isTrue(CollectionUtils.containsAll(counterIndexes.keySet(), counterNames));

        LOG.info("Registered counters : {}",
                StringUtils.join(MapUtils.invertMap(counterIndexes).entrySet(), ", "));
    }

    /**
     * Resolves an enum constant to a counter index. Built-in {@link Counter}
     * values live after the caller-supplied counters, so their ordinal alone
     * would address the wrong slot; every other enum maps by ordinal.
     */
    private int indexOf(Enum<?> counter) {
        if (counter instanceof Counter) {
            return getIndex((Counter) counter);
        }

        return counter.ordinal();
    }

    /** @return true if {@code index} addresses a registered counter; logs an error otherwise */
    private boolean validate(int index) {
        if (index < 0 || index >= countersCount.get()) {
            LOG.error("Invalid index #{}", index);
            return false;
        }

        return true;
    }
}