org.apache.nutch.crawl.InjectorJob.java Source code

Introduction

Here is the source code for org.apache.nutch.crawl.InjectorJob.java
Source

/*******************************************************************************
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package org.apache.nutch.crawl;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.ByteBuffer;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import org.apache.avro.util.Utf8;
import org.apache.gora.mapreduce.GoraOutputFormat;
import org.apache.gora.persistency.Persistent;
import org.apache.gora.store.DataStore;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.atexpats.common.AtexpatsConstants;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.storage.Mark;
import org.apache.nutch.storage.StorageUtils;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
import org.apache.nutch.util.TableUtil;
import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.ToolUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/** This class takes a flat file of URLs and adds them to the store of pages to be
 * crawled.  Useful for bootstrapping the system.
 * The URL files contain one URL per line, optionally followed by custom metadata
 * separated by tabs, with each metadata key separated from its value by '='. <br>
 * Note that some metadata keys are reserved: <br>
 * - <i>nutch.score</i> : sets a custom score for a specific URL <br>
 * - <i>nutch.fetchInterval</i> : sets a custom fetch interval for a specific URL <br>
 * e.g. http://www.nutch.org/ \t nutch.score=10 \t nutch.fetchInterval=2592000 \t userType=open_source
 **/
public class InjectorJob extends NutchTool implements Tool {

    /** Logger shared by the job and by the nested mapper. */
    public static final Logger LOG = LoggerFactory.getLogger(InjectorJob.class);

    /**
     * WebPage fields (MARKERS and STATUS, added in the static initializer below).
     * Not referenced by any other code visible in this file.
     */
    private static final Set<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();

    /** Value stored under the inject mark of every injected row. */
    private static final Utf8 YES_STRING = new Utf8("y");

    static {
        FIELDS.add(WebPage.Field.MARKERS);
        FIELDS.add(WebPage.Field.STATUS);
    }

    /** metadata key reserved for setting a custom score for a specific URL */
    // NOTE(review): public, static and non-final, so any code can reassign this key at runtime.
    public static String nutchScoreMDName = "nutch.score";
    /**
     * metadata key reserved for setting a custom fetchInterval for a specific URL
     */
    // NOTE(review): public, static and non-final, so any code can reassign this key at runtime.
    public static String nutchFetchIntervalMDName = "nutch.fetchInterval";

    /**
     * Groups listing ids whose seed URLs share the same prefix (host + path).
     * The key is the first id seen for a prefix; the value lists the ids seen
     * later for that same prefix. It is rebuilt by {@code run(Map)} and read by
     * {@code UrlMapper.map}, which stores the joined list as the "sameUrlId"
     * metadata entry.
     */
    // NOTE(review): this is plain static state; presumably mapper tasks running in a
    // different JVM than the one that called run(Map) see only the empty initial map - confirm.
    public static Map<Integer, List<Integer>> multiMap = new HashMap<Integer, List<Integer>>();

    /**
     * Mapper that converts each line of a seed file into a {@link WebPage} row
     * keyed by the reversed URL.
     * <p>
     * A line has the form
     * {@code <url>[\t<name>=<value>]... [## <id> [## <status>]]}. The id is stored
     * as the "urlId" metadata entry and the status as "urlStatus". The tab
     * separated pairs are stored as metadata too, except for the reserved keys
     * {@link InjectorJob#nutchScoreMDName} and
     * {@link InjectorJob#nutchFetchIntervalMDName}, which set the row's score and
     * fetch interval. Lines whose URL part starts with '#' are ignored.
     */
    public static class UrlMapper extends Mapper<LongWritable, Text, String, WebPage> {
        /** Separator between the URL part, the listing id and the status. */
        private static final String FIELD_SEPARATOR = "##";
        /** Charset used to encode metadata values, independent of the platform default. */
        private static final java.nio.charset.Charset UTF_8 = java.nio.charset.Charset.forName("UTF-8");

        private URLNormalizers urlNormalizers;
        private int interval;
        private float scoreInjected;
        private URLFilters filters;
        private ScoringFilters scfilters;
        private long curTime;

        /**
         * Reads the injector settings from the job configuration and creates the
         * normalizers, URL filters and scoring filters used by {@link #map}.
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            Configuration conf = context.getConfiguration();
            urlNormalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_INJECT);
            interval = conf.getInt("db.fetch.interval.default", 2592000);
            filters = new URLFilters(conf);
            scfilters = new ScoringFilters(conf);
            scoreInjected = conf.getFloat("db.score.injected", 1.0f);
            curTime = conf.getLong("injector.current.time", System.currentTimeMillis());
        }

        /**
         * Parses one seed line and, if the URL survives normalization and
         * filtering, writes the corresponding row.
         *
         * @param key     input key supplied by the input format (not used)
         * @param value   one line of the seed file
         * @param context task context used for counters and output
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            String url = line;

            Integer idFile = null;
            String status = AtexpatsConstants.STATUS_NEW;
            // String.split drops trailing empty strings, so a line made only of
            // separators yields an empty array and the whole line stays the URL.
            String[] arrUrl = line.split(FIELD_SEPARATOR);
            if (arrUrl.length > 0) {
                url = arrUrl[0];
                if (arrUrl.length > 1) {
                    try {
                        idFile = Integer.parseInt(arrUrl[1].trim());
                    } catch (NumberFormatException nfe) {
                        // best effort: the URL is still injected, just without an id
                        LOG.warn("Ignoring non-numeric url id '" + arrUrl[1].trim() + "' in line: " + line);
                    }
                }
                if (arrUrl.length > 2) {
                    status = arrUrl[2].trim();
                }
            }

            if (url.trim().startsWith("#")) {
                /* Ignore lines that start with # */
                return;
            }

            // if tabs : metadata that could be stored
            // must be name=value and separated by \t
            float customScore = -1f;
            int customInterval = interval;
            Map<String, String> metadata = new TreeMap<String, String>();

            if (idFile != null) {
                metadata.put("urlId", String.valueOf(idFile));
                // read the shared static once so the check and the use see the same map
                Map<Integer, List<Integer>> idsByFirstId = multiMap;
                List<Integer> sameIds = idsByFirstId == null ? null : idsByFirstId.get(idFile);
                if (sameIds != null && !sameIds.isEmpty()) {
                    metadata.put("sameUrlId", org.apache.commons.lang.StringUtils.join(sameIds.toArray(), ","));
                }
            }

            // put url status to metadata
            metadata.put("urlStatus", status);

            if (url.indexOf("\t") != -1) {
                String[] splits = url.split("\t");
                // a URL part made only of tabs splits into an empty array
                url = splits.length > 0 ? splits[0] : "";
                for (int s = 1; s < splits.length; s++) {
                    // find separation between name and value
                    int indexEquals = splits[s].indexOf("=");
                    if (indexEquals == -1) {
                        // skip anything without a =
                        continue;
                    }
                    String metaname = splits[s].substring(0, indexEquals);
                    String metavalue = splits[s].substring(indexEquals + 1);
                    if (metaname.equals(nutchScoreMDName)) {
                        try {
                            customScore = Float.parseFloat(metavalue);
                        } catch (NumberFormatException nfe) {
                            LOG.warn("Ignoring invalid " + nutchScoreMDName + " '" + metavalue + "' for " + url);
                        }
                    } else if (metaname.equals(nutchFetchIntervalMDName)) {
                        try {
                            customInterval = Integer.parseInt(metavalue);
                        } catch (NumberFormatException nfe) {
                            LOG.warn("Ignoring invalid " + nutchFetchIntervalMDName + " '" + metavalue + "' for "
                                    + url);
                        }
                    } else {
                        metadata.put(metaname, metavalue);
                    }
                }
            }

            // The URL part still carries the whitespace that surrounded the
            // separators; strip it so normalizers and filters see the bare URL.
            url = url.trim();

            try {
                url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT);
                url = filters.filter(url); // filter the url
            } catch (Exception e) {
                LOG.warn("Skipping " + url + ":" + e);
                url = null;
            }
            if (url == null) {
                context.getCounter("injector", "urls_filtered").increment(1);
                return;
            }

            // the url passed: build the row and collect it
            String reversedUrl = TableUtil.reverseUrl(url);
            WebPage row = new WebPage();

            // the row's prefix field carries the url status
            row.setPrefix(new Utf8(status));

            row.setFetchTime(curTime);
            row.setFetchInterval(customInterval);

            // now add the metadata, always encoded as UTF-8
            for (Map.Entry<String, String> md : metadata.entrySet()) {
                row.putToMetadata(new Utf8(md.getKey()), ByteBuffer.wrap(md.getValue().getBytes(UTF_8)));
            }

            // -1 is the "no custom score on this line" sentinel
            if (customScore != -1) {
                row.setScore(customScore);
            } else {
                row.setScore(scoreInjected);
            }

            try {
                scfilters.injectedScore(url, row);
            } catch (ScoringFilterException e) {
                if (LOG.isWarnEnabled()) {
                    LOG.warn("Cannot filter injected score for url " + url + ", using default ("
                            + e.getMessage() + ")");
                }
            }
            context.getCounter("injector", "urls_injected").increment(1);
            row.putToMarkers(DbUpdaterJob.DISTANCE, new Utf8(String.valueOf(0)));
            Mark.INJECT_MARK.putMark(row, YES_STRING);
            context.write(reversedUrl, row);
        }
    }

    /** Creates an injector; this constructor itself sets no configuration. */
    public InjectorJob() {
        super();
    }

    /**
     * Creates an injector that uses the given configuration.
     *
     * @param conf configuration handed to {@code setConf}
     */
    public InjectorJob(Configuration conf) {
        super();
        this.setConf(conf);
    }

    /**
     * Writes the seed file {@code url.txt} into the seed directory and runs the
     * map-only inject job over that directory.
     * <p>
     * The seed file is built from a map of listing id to URL, where a URL may be
     * prefixed with a status ("NE;", "UP;" or "DE;"). The code that used to load
     * that map (hard-coded samples and a database lookup) is disabled, so the map
     * is currently always empty and {@code url.txt} is (re)written with no lines.
     *
     * @param args job arguments; must contain {@link Nutch#ARG_SEEDDIR} as a
     *             {@link Path} or as an object whose {@code toString()} is the path
     * @return the job status map filled by {@code ToolUtil.recordJobStatus}
     * @throws IllegalArgumentException if the seed directory argument is missing
     * @throws IOException              if the seed directory or file cannot be written
     */
    @Override
    public Map<String, Object> run(Map<String, Object> args) throws Exception {
        getConf().setLong("injector.current.time", System.currentTimeMillis());
        Object path = args.get(Nutch.ARG_SEEDDIR);
        if (path == null) {
            throw new IllegalArgumentException("Missing seed directory argument: " + Nutch.ARG_SEEDDIR);
        }
        Path input = (path instanceof Path) ? (Path) path : new Path(path.toString());

        numJobs = 1;
        currentJobNum = 0;
        currentJob = new NutchJob(getConf(), "inject " + input);

        // TODO: populate invertMapUrl (listing id -> "[NE;|UP;|DE;]url"); nothing fills it
        // at the moment, so every map below stays empty.
        Map<String, Integer> mapUrl = new HashMap<String, Integer>();
        Map<Integer, String> invertMapUrl = new HashMap<Integer, String>();
        Map<String, String> mapStatus = new HashMap<String, String>();

        // NOTE(review): multiMap is static state read by UrlMapper; presumably it is only
        // visible to map tasks that run inside this JVM - confirm for cluster deployments.
        multiMap = new HashMap<Integer, List<Integer>>();
        indexListingUrls(invertMapUrl, mapUrl, mapStatus);

        String dirStr = currentJob.getWorkingDirectory().toUri().getPath();
        LOG.info("Working directory: " + dirStr);

        // NOTE(review): java.io.File is used on the job's working directory, which
        // presumably requires that directory to be on the local filesystem - confirm.
        dirStr = dirStr + "/" + input;
        File dirFile = new File(dirStr);
        if (!dirFile.exists() && !dirFile.mkdirs()) {
            throw new IOException("Unable to create seed directory " + dirStr);
        }
        LOG.info("Current directory: " + dirStr);
        String fileStr = dirStr + "/" + "url.txt";
        LOG.info("File name: " + fileStr);
        writeSeedFile(fileStr, mapUrl, mapStatus);

        FileInputFormat.addInputPath(currentJob, input);
        currentJob.setMapperClass(UrlMapper.class);
        currentJob.setMapOutputKeyClass(String.class);
        currentJob.setMapOutputValueClass(WebPage.class);
        currentJob.setOutputFormatClass(GoraOutputFormat.class);

        DataStore<String, WebPage> store = StorageUtils.createWebStore(currentJob.getConfiguration(), String.class,
                WebPage.class);
        GoraOutputFormat.setOutput(currentJob, store, true);

        //TODO LOG
        LOG.info("CONFIG:  " + currentJob.getConfiguration().toString());

        // NUTCH-1471 Make explicit which datastore class we use
        Class<? extends DataStore<Object, Persistent>> dataStoreClass = StorageUtils
                .getDataStoreClass(currentJob.getConfiguration());
        LOG.info("InjectorJob: Using " + dataStoreClass + " as the Gora storage class.");

        // map-only job: the reducer class is set but no reduce task is run
        currentJob.setReducerClass(Reducer.class);
        currentJob.setNumReduceTasks(0);

        currentJob.waitForCompletion(true);
        ToolUtil.recordJobStatus(null, currentJob, results);

        // NUTCH-1370 Make explicit #URLs injected @runtime
        long urlsInjected = currentJob.getCounters().findCounter("injector", "urls_injected").getValue();
        long urlsFiltered = currentJob.getCounters().findCounter("injector", "urls_filtered").getValue();
        LOG.info("InjectorJob: total number of urls rejected by filters: " + urlsFiltered);
        LOG.info("InjectorJob: total number of urls injected after normalization and filtering: " + urlsInjected);

        return results;
    }

    /**
     * Fills {@code mapUrl} (url to id), {@code mapStatus} (url to status) and the
     * static {@link #multiMap} (first id of a host+path prefix to later ids with
     * the same prefix) from the id to url entries of {@code invertMapUrl}.
     */
    private static void indexListingUrls(Map<Integer, String> invertMapUrl, Map<String, Integer> mapUrl,
            Map<String, String> mapStatus) {
        Map<String, Integer> prefixMap = new HashMap<String, Integer>();
        for (Map.Entry<Integer, String> entry : invertMapUrl.entrySet()) {
            String value = entry.getValue();
            if (value == null) {
                continue;
            }
            // The status is scoped to this entry: an entry without a prefix is "NE"
            // and must not inherit the status parsed for a previous entry.
            String status = "NE";
            if (value.startsWith("NE;") || value.startsWith("UP;") || value.startsWith("DE;")) {
                status = value.substring(0, 2);
                value = value.substring(3);
            }

            if (org.apache.commons.lang.StringUtils.isBlank(value)) {
                continue;
            }
            if (!value.startsWith("http")) {
                value = "http://" + value;
            }
            URL url;
            try {
                url = new URL(value);
            } catch (MalformedURLException e) {
                LOG.error("Skipping malformed seed url: " + value, e);
                continue;
            }

            String prefix = url.getHost() + url.getPath();
            if (!prefixMap.containsKey(prefix)) {
                prefixMap.put(prefix, entry.getKey());
            } else {
                Integer firstId = prefixMap.get(prefix);
                List<Integer> sameIds = multiMap.get(firstId);
                if (sameIds == null) {
                    sameIds = new ArrayList<Integer>();
                    multiMap.put(firstId, sameIds);
                }
                sameIds.add(entry.getKey());
            }
            // the first id and status seen for a url win
            if (!mapUrl.containsKey(value)) {
                mapUrl.put(value, entry.getKey());
            }
            if (!mapStatus.containsKey(value)) {
                mapStatus.put(value, status);
            }
        }
    }

    /**
     * Writes one {@code url ## id ## status} line per entry of {@code mapUrl} to
     * {@code fileStr} as UTF-8, replacing any existing file, and closes the
     * stream even when writing fails.
     */
    private static void writeSeedFile(String fileStr, Map<String, Integer> mapUrl, Map<String, String> mapStatus)
            throws IOException {
        String newLine = System.getProperty("line.separator");
        FileOutputStream file = new FileOutputStream(fileStr);
        try {
            Writer bw = new BufferedWriter(new OutputStreamWriter(file, "UTF8"));
            for (Map.Entry<String, Integer> entry : mapUrl.entrySet()) {
                String key = entry.getKey();
                if (key != null && !key.startsWith("http")) {
                    key = "http://" + key;
                }
                bw.append(key + " ## " + entry.getValue() + " ## " + mapStatus.get(key) + newLine);
                LOG.info("Append -- Key : " + key + " Value : " + entry.getValue());
            }
            bw.flush();
        } finally {
            file.close();
        }
    }

    /**
     * Runs the inject job for the given seed directory and logs the start time,
     * the finish time and the elapsed time.
     *
     * @param urlDir seed directory passed to the job as {@link Nutch#ARG_SEEDDIR}
     * @throws Exception whatever {@code run(Map)} throws
     */
    public void inject(Path urlDir) throws Exception {
        final SimpleDateFormat stampFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

        final long startedAt = System.currentTimeMillis();
        final String startStamp = stampFormat.format(startedAt);
        LOG.info("InjectorJob: starting at " + startStamp);
        LOG.info("InjectorJob: Injecting urlDir: " + urlDir);

        run(ToolUtil.toArgMap(Nutch.ARG_SEEDDIR, urlDir));

        final long finishedAt = System.currentTimeMillis();
        final StringBuilder summary = new StringBuilder("Injector: finished at ");
        summary.append(stampFormat.format(finishedAt));
        summary.append(", elapsed: ");
        summary.append(TimingUtil.elapsedTime(startedAt, finishedAt));
        LOG.info(summary.toString());
    }

    /**
     * Command-line entry point: {@code InjectorJob <url_dir> [-crawlId <id>]}.
     * <p>
     * The seed directory is always "urls"; any other value given as the first
     * argument is replaced (in {@code args[0]}) and a warning is logged.
     *
     * @param args command-line arguments
     * @return 0 on success, -1 on a usage error or when the injection fails
     */
    @Override
    public int run(String[] args) throws Exception {
        final String usage = "Usage: InjectorJob <url_dir> [-crawlId <id>]";
        if (args.length < 1) {
            System.err.println(usage);
            return -1;
        }
        for (int i = 1; i < args.length; i++) {
            if ("-crawlId".equals(args[i])) {
                // -crawlId needs a value; without this check args[i + 1] would
                // throw ArrayIndexOutOfBoundsException when the flag comes last
                if (i + 1 >= args.length) {
                    System.err.println("Missing value for -crawlId");
                    System.err.println(usage);
                    return -1;
                }
                getConf().set(Nutch.CRAWL_ID_KEY, args[i + 1]);
                i++;
            } else {
                System.err.println("Unrecognized arg " + args[i]);
                return -1;
            }
        }

        try {
            // seed dir always 'urls'
            if (!"urls".equals(args[0])) {
                LOG.warn("InjectorJob: ignoring seed dir '" + args[0] + "', using 'urls'");
                args[0] = "urls";
            }
            inject(new Path(args[0]));
            return 0;
        } catch (Exception e) {
            LOG.error("InjectorJob: " + StringUtils.stringifyException(e));
            return -1;
        }
    }

    /**
     * Launches the injector through {@link ToolRunner} with a fresh Nutch
     * configuration and exits the JVM with the tool's return code.
     *
     * @param args command-line arguments forwarded to {@code run(String[])}
     */
    public static void main(String[] args) throws Exception {
        final Configuration nutchConf = NutchConfiguration.create();
        final InjectorJob injector = new InjectorJob();
        final int exitCode = ToolRunner.run(nutchConf, injector, args);
        System.exit(exitCode);
    }
}