babel.prep.extract.PageExtReducer.java Source code

Introduction

Here is the source code for babel.prep.extract.PageExtReducer.java
Source

/**
 * This file is licensed to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package babel.prep.extract;

import java.io.IOException;
import java.util.Iterator;
import java.util.Locale;
import java.util.regex.Pattern;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

import babel.content.pages.Page;

/**
 * Constructs Pages comprised of PageVersions from chunks returned for a URL.
 */
class PageExtReducer extends MapReduceBase implements Reducer<Text, NutchChunk, Text, Page> {
    public void reduce(Text key, Iterator<NutchChunk> values, OutputCollector<Text, Page> output, Reporter reporter)
            throws IOException {
        // Create a new page (potentially containing multiple versions)
        Page page = new Page(key.toString(), values);
        int numVersions = page.numVersions();

        // Only care about it if we have at least one version
        if (numVersions > 0 && (page.pageURL().length() > 0))// && isBBCEnglish(page))
        {
            NutchPageExtractor.Stats.incPages();
            NutchPageExtractor.Stats.incVersions(numVersions);

            output.collect(key, page);
        } else {
            NutchPageExtractor.Stats.incIgnoredPages();
        }
    }

    protected boolean isBBCEnglish(Page page) {
        String url = removeProtocolAndPrefix(page.pageURL());
        return (url.matches("^bbc.co.uk/(hi/|low/)?english/.*") || url.matches("^bbc.co.uk/local/.*")
                || url.matches("^bbc.co.uk/[12]/.*"));
    }

    protected String removeProtocolAndPrefix(String url) {
        // Strip everything up to first dot, and lowercase
        return url.substring(url.indexOf(".") + 1).toLowerCase();
    }
}