// NOTE(review): stray text "Java tutorial" commented out — it was not valid Java and broke compilation.
/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * <p/> * http://www.apache.org/licenses/LICENSE-2.0 * <p/> * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.store.readable; import org.apache.avro.util.Utf8; import org.apache.gora.store.DataStore; import org.apache.hadoop.conf.Configuration; import org.apache.html.dom.HTMLDocumentImpl; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.metadata.Nutch; import org.apache.nutch.parse.*; import org.apache.nutch.parse.html.DOMBuilder; import org.apache.nutch.parse.html.DOMContentUtils; import org.apache.nutch.parse.html.HTMLMetaProcessor; import org.apache.nutch.parse.html.HtmlParser; import org.apache.nutch.protocol.Content; import org.apache.nutch.storage.ParseStatus; import org.apache.nutch.storage.WebPage; import org.apache.nutch.store.readable.utils.JsoupTokopedia; import org.apache.nutch.store.readable.utils.StorageUtilsPlugin; import org.apache.nutch.util.Bytes; import org.apache.nutch.util.EncodingDetector; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.TableUtil; import org.cyberneko.html.parsers.DOMFragmentParser; import org.json.JSONException; import org.json.JSONObject; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.DOMException; import org.w3c.dom.DocumentFragment; import 
org.xml.sax.InputSource; import org.xml.sax.SAXException; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URL; import java.nio.ByteBuffer; import java.nio.charset.Charset; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; public class StoreReadable extends HtmlParser { public static final Logger LOG = LoggerFactory.getLogger("org.apache.nutch.store.readable.StoreReadable"); // I used 1000 bytes at first, but found that some documents have // meta tag well past the first 1000 bytes. // (e.g. http://cn.promo.yahoo.com/customcare/music.html) private static final int CHUNK_SIZE = 2000; // NUTCH-1006 Meta equiv with single quotes not accepted private static Pattern metaPattern = Pattern .compile("<meta\\s+([^>]*http-equiv=(\"|')?content-type(\"|')?[^>]*)>", Pattern.CASE_INSENSITIVE); private static Pattern charsetPattern = Pattern.compile("charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE); private static Pattern charsetPatternHTML5 = Pattern .compile("<meta\\s+charset\\s*=\\s*[\"']?([a-z][_\\-0-9a-z]*)[^>]*>", Pattern.CASE_INSENSITIVE); private static Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>(); static { FIELDS.add(WebPage.Field.BASE_URL); FIELDS.add(WebPage.Field.READABLE); } private String parserImpl; /** * Given a <code>ByteBuffer</code> representing an html file of an * <em>unknown</em> encoding, read out 'charset' parameter in the meta tag * from the first <code>CHUNK_SIZE</code> bytes. * If there's no meta tag for Content-Type or no charset is specified, * the content is checked for a Unicode Byte Order Mark (BOM). * This will also cover non-byte oriented character encodings (UTF-16 only). * If no character set can be determined, * <code>null</code> is returned. 
<br /> * See also http://www.w3.org/International/questions/qa-html-encoding-declarations, * http://www.w3.org/TR/2011/WD-html5-diff-20110405/#character-encoding, and * http://www.w3.org/TR/REC-xml/#sec-guessing * <br /> * * @param content <code>ByteBuffer</code> representation of an html file */ private static String sniffCharacterEncoding(ByteBuffer content) { System.out.println( "[STORE-READABLE]sniffCharacterEncoding----------------------------------------------------------"); int length = Math.min(content.remaining(), CHUNK_SIZE); // We don't care about non-ASCII parts so that it's sufficient // to just inflate each byte to a 16-bit value by padding. // For instance, the sequence {0x41, 0x82, 0xb7} will be turned into // {U+0041, U+0082, U+00B7}. String str = ""; try { str = new String(content.array(), content.arrayOffset() + content.position(), length, Charset.forName("ASCII").toString()); } catch (UnsupportedEncodingException e) { // code should never come here, but just in case... 
return null; } Matcher metaMatcher = metaPattern.matcher(str); String encoding = null; if (metaMatcher.find()) { Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1)); if (charsetMatcher.find()) encoding = new String(charsetMatcher.group(1)); } if (encoding == null) { // check for HTML5 meta charset metaMatcher = charsetPatternHTML5.matcher(str); if (metaMatcher.find()) { encoding = new String(metaMatcher.group(1)); } } if (encoding == null) { // check for BOM if (length >= 3 && content.get(0) == (byte) 0xEF && content.get(1) == (byte) 0xBB && content.get(2) == (byte) 0xBF) { encoding = "UTF-8"; } else if (length >= 2) { if (content.get(0) == (byte) 0xFF && content.get(1) == (byte) 0xFE) { encoding = "UTF-16LE"; } else if (content.get(0) == (byte) 0xFE && content.get(1) == (byte) 0xFF) { encoding = "UTF-16BE"; } } } return encoding; } private String defaultCharEncoding; private Configuration conf; private DOMContentUtils utils; private ParseFilters htmlParseFilters; private String cachingPolicy; public Parse getParse(String url, WebPage page) { HTMLMetaTags metaTags = new HTMLMetaTags(); System.out.println("[STORE-READABLE]getParse-------------------------------------------------------------"); String baseUrl = TableUtil.toString(page.getBaseUrl()); URL base; try { base = new URL(baseUrl); } catch (MalformedURLException e) { return ParseStatusUtils.getEmptyParse(e, getConf()); } String text = ""; String title = ""; Outlink[] outlinks = new Outlink[0]; // parse the content DocumentFragment root; try { ByteBuffer contentInOctets = page.getContent(); InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets.array(), contentInOctets.arrayOffset() + contentInOctets.position(), contentInOctets.remaining())); EncodingDetector detector = new EncodingDetector(conf); detector.autoDetectClues(page, true); detector.addClue(sniffCharacterEncoding(contentInOctets), "sniffed"); String encoding = detector.guessEncoding(page, defaultCharEncoding); 
page.getMetadata().put(new Utf8(Metadata.ORIGINAL_CHAR_ENCODING), ByteBuffer.wrap(Bytes.toBytes(encoding))); page.getMetadata().put(new Utf8(Metadata.CHAR_ENCODING_FOR_CONVERSION), ByteBuffer.wrap(Bytes.toBytes(encoding))); input.setEncoding(encoding); if (LOG.isTraceEnabled()) { LOG.trace("Parsing..."); } root = parse(input); } catch (IOException e) { LOG.error("Failed with the following IOException: ", e); return ParseStatusUtils.getEmptyParse(e, getConf()); } catch (DOMException e) { LOG.error("Failed with the following DOMException: ", e); return ParseStatusUtils.getEmptyParse(e, getConf()); } catch (SAXException e) { LOG.error("Failed with the following SAXException: ", e); return ParseStatusUtils.getEmptyParse(e, getConf()); } catch (Exception e) { LOG.error("Failed with the following Exception: ", e); return ParseStatusUtils.getEmptyParse(e, getConf()); } // get meta directives HTMLMetaProcessor.getMetaTags(metaTags, root, base); if (LOG.isTraceEnabled()) { LOG.trace("Meta tags for " + base + ": " + metaTags.toString()); } // check meta directives if (!metaTags.getNoIndex()) { // okay to index StringBuilder sb = new StringBuilder(); if (LOG.isTraceEnabled()) { LOG.trace("Getting text..."); } utils.getText(sb, root); // extract text text = sb.toString(); sb.setLength(0); if (LOG.isTraceEnabled()) { LOG.trace("Getting title..."); } utils.getTitle(sb, root); // extract title title = sb.toString().trim(); } if (!metaTags.getNoFollow()) { // okay to follow links ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks URL baseTag = utils.getBase(root); if (LOG.isTraceEnabled()) { LOG.trace("Getting links..."); } utils.getOutlinks(baseTag != null ? 
baseTag : base, l, root); outlinks = l.toArray(new Outlink[l.size()]); if (LOG.isTraceEnabled()) { LOG.trace("found " + outlinks.length + " outlinks in " + url); } } ParseStatus status = ParseStatus.newBuilder().build(); status.setMajorCode((int) ParseStatusCodes.SUCCESS); if (metaTags.getRefresh()) { status.setMinorCode((int) ParseStatusCodes.SUCCESS_REDIRECT); status.getArgs().add(new Utf8(metaTags.getRefreshHref().toString())); status.getArgs().add(new Utf8(Integer.toString(metaTags.getRefreshTime()))); } String strJo = addJsonToPage(url, page); // storeJsonToSchema(url, page ,strJo); page.setReadable(new Utf8(strJo)); Parse parse = new Parse(text, title, outlinks, status, strJo); parse = htmlParseFilters.filter(url, page, parse, metaTags, root); if (metaTags.getNoCache()) { // not okay to cache page.getMetadata().put(new Utf8(Nutch.CACHING_FORBIDDEN_KEY), ByteBuffer.wrap(Bytes.toBytes(cachingPolicy))); } parse.setJsonRead(strJo); return parse; } private String addJsonToPage(String url, WebPage page) { String jsonStr = null; //====================the part of convert to json Map<CharSequence, ByteBuffer> metadataByte = page.getMetadata(); Metadata metaData = new Metadata(); for (CharSequence cs : metadataByte.keySet()) { String csString = cs.toString(); String value = String.valueOf(metadataByte.get(cs).array()); metaData.add(csString, value); } Content r = new Content(url, url, page.getContent().array(), page.getContentType().toString(), metaData, conf); JsoupTokopedia jsT = new JsoupTokopedia(); try { JSONObject jo = jsT.constructJson(r); jsonStr = String.valueOf(jo); } catch (JSONException e) { e.printStackTrace(); } return jsonStr; } private DocumentFragment parse(InputSource input) throws Exception { System.out.println("[STORE-READABLE]---------------------------------------------------parse"); if (parserImpl.equalsIgnoreCase("tagsoup")) return parseTagSoup(input); else return parseNeko(input); } private DocumentFragment parseTagSoup(InputSource input) 
throws Exception { System.out.println("[STORE-READABLE]---------------------------------------------------parseTagSoup"); HTMLDocumentImpl doc = new HTMLDocumentImpl(); DocumentFragment frag = doc.createDocumentFragment(); DOMBuilder builder = new DOMBuilder(doc, frag); org.ccil.cowan.tagsoup.Parser reader = new org.ccil.cowan.tagsoup.Parser(); reader.setContentHandler(builder); reader.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true); reader.setFeature(org.ccil.cowan.tagsoup.Parser.bogonsEmptyFeature, false); reader.setProperty("http://xml.org/sax/properties/lexical-handler", builder); reader.parse(input); return frag; } private DocumentFragment parseNeko(InputSource input) throws Exception { System.out.println("[STORE-READABLE]----------------------------------------------------parseNeko"); DOMFragmentParser parser = new DOMFragmentParser(); try { parser.setFeature("http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe", true); parser.setFeature("http://cyberneko.org/html/features/augmentations", true); parser.setProperty("http://cyberneko.org/html/properties/default-encoding", defaultCharEncoding); parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset", true); parser.setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content", false); parser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true); parser.setFeature("http://cyberneko.org/html/features/report-errors", LOG.isTraceEnabled()); } catch (SAXException e) { } // convert Document to DocumentFragment HTMLDocumentImpl doc = new HTMLDocumentImpl(); doc.setErrorChecking(false); DocumentFragment res = doc.createDocumentFragment(); DocumentFragment frag = doc.createDocumentFragment(); parser.parse(input, frag); res.appendChild(frag); try { while (true) { frag = doc.createDocumentFragment(); parser.parse(input, frag); if (!frag.hasChildNodes()) break; if (LOG.isInfoEnabled()) { LOG.info(" - new 
frag, " + frag.getChildNodes().getLength() + " nodes."); } res.appendChild(frag); } } catch (Exception x) { LOG.error("Failed with the following Exception: ", x); } ; return res; } public void setConf(Configuration conf) { this.conf = conf; this.htmlParseFilters = new ParseFilters(getConf()); this.parserImpl = getConf().get("parser.html.impl", "neko"); this.defaultCharEncoding = getConf().get("parser.character.encoding.default", "windows-1252"); this.utils = new DOMContentUtils(conf); this.cachingPolicy = getConf().get("parser.caching.forbidden.policy", Nutch.CACHING_FORBIDDEN_CONTENT); } public Configuration getConf() { return this.conf; } public void storeJsonToSchema(String url, WebPage page, String jsonPar) { // try { // // String rowKey=TableUtil.reverseUrl(url); // DataStore<String, WebPage> store = StorageUtilsPlugin.createWebStore(getConf(), // String.class, WebPage.class, null); // // page.setReadable(new Utf8(jsonPar)); // if(jsonPar.length()>2) { // store.put(rowKey, page); // } // store.execute(store.newQuery()); // // store.flush(); // store.close(); // // } catch (ClassNotFoundException e) { // e.printStackTrace(); // } catch (GoraException e) { // e.printStackTrace(); // } catch (MalformedURLException e) { // e.printStackTrace(); // } } @Override public Collection<WebPage.Field> getFields() { return FIELDS; } public static void main(String[] args) throws Exception { String rowKey = "com.tokopedia.www:https/xiaomi-mobile/xiaomi-yi-action-camera-paket-komplit-16gb"; String id = "webpage"; String schemaPrefix = "TestCrawl"; Configuration localConf = NutchConfiguration.create(); DataStore<String, WebPage> store = StorageUtilsPlugin.createWebStore(localConf, String.class, WebPage.class, schemaPrefix); // WebPage camera = store.get(rowKey); // LOG.setLevel(Level.FINE); // String name = args[0]; String url = String.valueOf(camera.getBaseUrl());//"file:"+name; // File file = new File(name); // byte[] bytes = new byte[(int)file.length()]; // DataInputStream 
in = new DataInputStream(new FileInputStream(file)); // in.readFully(bytes); // Configuration conf = NutchConfiguration.create(); StoreReadable parser = new StoreReadable(); parser.setConf(localConf); // WebPage page = WebPage.newBuilder().build(); // page.setBaseUrl(new Utf8(url)); // page.setContent(ByteBuffer.wrap(bytes)); // page.setContentType(new Utf8("text/html")); Parse parse = parser.getParse(url, camera); System.out.println("title: " + parse.getTitle()); System.out.println("text: " + parse.getText()); System.out.println("outlinks: " + Arrays.toString(parse.getOutlinks())); store.put(rowKey, camera); store.flush(); WebPage page2 = store.get(rowKey); String resultPage2 = page2.getReadable().toString(); System.out.println("result2=" + resultPage2); store.execute(store.newQuery()); store.close(); String reverseUrl = TableUtil.reverseUrl(url); System.out.println("reversedUrl:" + reverseUrl); } }