// NOTE(review): stray text "Java tutorial" commented out — it was not valid Java and broke compilation.
/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * <p/> * http://www.apache.org/licenses/LICENSE-2.0 * <p/> * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.store.readable; import org.apache.avro.util.Utf8; import org.apache.gora.store.DataStore; import org.apache.hadoop.conf.Configuration; import org.apache.html.dom.HTMLDocumentImpl; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.metadata.Nutch; import org.apache.nutch.parse.*; import org.apache.nutch.parse.html.DOMBuilder; import org.apache.nutch.parse.html.DOMContentUtils; import org.apache.nutch.parse.html.HTMLMetaProcessor; import org.apache.nutch.parse.html.HtmlParser; import org.apache.nutch.protocol.Content; import org.apache.nutch.storage.ParseStatus; import org.apache.nutch.storage.WebPage; import org.apache.nutch.store.readable.utils.JsoupTokopedia; import org.apache.nutch.store.readable.utils.StorageUtilsPlugin; import org.apache.nutch.util.Bytes; import org.apache.nutch.util.EncodingDetector; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.TableUtil; import org.cyberneko.html.parsers.DOMFragmentParser; import org.json.JSONException; import org.json.JSONObject; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.DOMException; import org.w3c.dom.DocumentFragment; import 
org.xml.sax.InputSource; import org.xml.sax.SAXException; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URL; import java.nio.ByteBuffer; import java.nio.charset.Charset; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; public class StoreReadable extends HtmlParser { public static final Logger LOG = LoggerFactory.getLogger("org.apache.nutch.store.readable.StoreReadable"); // I used 1000 bytes at first, but found that some documents have // meta tag well past the first 1000 bytes. // (e.g. http://cn.promo.yahoo.com/customcare/music.html) private static final int CHUNK_SIZE = 2000; // NUTCH-1006 Meta equiv with single quotes not accepted private static Pattern metaPattern = Pattern .compile("<meta\\s+([^>]*http-equiv=(\"|')?content-type(\"|')?[^>]*)>", Pattern.CASE_INSENSITIVE); private static Pattern charsetPattern = Pattern.compile("charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE); private static Pattern charsetPatternHTML5 = Pattern .compile("<meta\\s+charset\\s*=\\s*[\"']?([a-z][_\\-0-9a-z]*)[^>]*>", Pattern.CASE_INSENSITIVE); private static Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>(); static { FIELDS.add(WebPage.Field.BASE_URL); FIELDS.add(WebPage.Field.READABLE); } private String parserImpl; /** * Given a <code>ByteBuffer</code> representing an html file of an * <em>unknown</em> encoding, read out 'charset' parameter in the meta tag * from the first <code>CHUNK_SIZE</code> bytes. * If there's no meta tag for Content-Type or no charset is specified, * the content is checked for a Unicode Byte Order Mark (BOM). * This will also cover non-byte oriented character encodings (UTF-16 only). * If no character set can be determined, * <code>null</code> is returned. 
<br /> * See also http://www.w3.org/International/questions/qa-html-encoding-declarations, * http://www.w3.org/TR/2011/WD-html5-diff-20110405/#character-encoding, and * http://www.w3.org/TR/REC-xml/#sec-guessing * <br /> * * @param content <code>ByteBuffer</code> representation of an html file */ private static String sniffCharacterEncoding(ByteBuffer content) { System.out.println( "[STORE-READABLE]sniffCharacterEncoding----------------------------------------------------------"); int length = Math.min(content.remaining(), CHUNK_SIZE); // We don't care about non-ASCII parts so that it's sufficient // to just inflate each byte to a 16-bit value by padding. // For instance, the sequence {0x41, 0x82, 0xb7} will be turned into // {U+0041, U+0082, U+00B7}. String str = ""; try { str = new String(content.array(), content.arrayOffset() + content.position(), length, Charset.forName("ASCII").toString()); } catch (UnsupportedEncodingException e) { // code should never come here, but just in case... 
return null; } Matcher metaMatcher = metaPattern.matcher(str); String encoding = null; if (metaMatcher.find()) { Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1)); if (charsetMatcher.find()) encoding = new String(charsetMatcher.group(1)); } if (encoding == null) { // check for HTML5 meta charset metaMatcher = charsetPatternHTML5.matcher(str); if (metaMatcher.find()) { encoding = new String(metaMatcher.group(1)); } } if (encoding == null) { // check for BOM if (length >= 3 && content.get(0) == (byte) 0xEF && content.get(1) == (byte) 0xBB && content.get(2) == (byte) 0xBF) { encoding = "UTF-8"; } else if (length >= 2) { if (content.get(0) == (byte) 0xFF && content.get(1) == (byte) 0xFE) { encoding = "UTF-16LE"; } else if (content.get(0) == (byte) 0xFE && content.get(1) == (byte) 0xFF) { encoding = "UTF-16BE"; } } } return encoding; } private String defaultCharEncoding; private Configuration conf; private DOMContentUtils utils; private ParseFilters htmlParseFilters; private String cachingPolicy; public Parse getParse(String url, WebPage page) { HTMLMetaTags metaTags = new HTMLMetaTags(); System.out.println("[STORE-READABLE]getParse-------------------------------------------------------------"); String baseUrl = TableUtil.toString(page.getBaseUrl()); URL base; try { base = new URL(baseUrl); } catch (MalformedURLException e) { return ParseStatusUtils.getEmptyParse(e, getConf()); } String text = ""; String title = ""; Outlink[] outlinks = new Outlink[0]; // parse the content DocumentFragment root; try { ByteBuffer contentInOctets = page.getContent(); InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets.array(), contentInOctets.arrayOffset() + contentInOctets.position(), contentInOctets.remaining())); EncodingDetector detector = new EncodingDetector(conf); detector.autoDetectClues(page, true); detector.addClue(sniffCharacterEncoding(contentInOctets), "sniffed"); String encoding = detector.guessEncoding(page, defaultCharEncoding); 
page.getMetadata().put(new Utf8(Metadata.ORIGINAL_CHAR_ENCODING), ByteBuffer.wrap(Bytes.toBytes(encoding))); page.getMetadata().put(new Utf8(Metadata.CHAR_ENCODING_FOR_CONVERSION), ByteBuffer.wrap(Bytes.toBytes(encoding))); input.setEncoding(encoding); if (LOG.isTraceEnabled()) { LOG.trace("Parsing..."); } root = parse(input); } catch (IOException e) { LOG.error("Failed with the following IOException: ", e); return ParseStatusUtils.getEmptyParse(e, getConf()); } catch (DOMException e) { LOG.error("Failed with the following DOMException: ", e); return ParseStatusUtils.getEmptyParse(e, getConf()); } catch (SAXException e) { LOG.error("Failed with the following SAXException: ", e); return ParseStatusUtils.getEmptyParse(e, getConf()); } catch (Exception e) { LOG.error("Failed with the following Exception: ", e); return ParseStatusUtils.getEmptyParse(e, getConf()); } // get meta directives HTMLMetaProcessor.getMetaTags(metaTags, root, base); if (LOG.isTraceEnabled()) { LOG.trace("Meta tags for " + base + ": " + metaTags.toString()); } // check meta directives if (!metaTags.getNoIndex()) { // okay to index StringBuilder sb = new StringBuilder(); if (LOG.isTraceEnabled()) { LOG.trace("Getting text..."); } utils.getText(sb, root); // extract text text = sb.toString(); sb.setLength(0); if (LOG.isTraceEnabled()) { LOG.trace("Getting title..."); } utils.getTitle(sb, root); // extract title title = sb.toString().trim(); } if (!metaTags.getNoFollow()) { // okay to follow links ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks URL baseTag = utils.getBase(root); if (LOG.isTraceEnabled()) { LOG.trace("Getting links..."); } utils.getOutlinks(baseTag != null ? 
baseTag : base, l, root); outlinks = l.toArray(new Outlink[l.size()]); if (LOG.isTraceEnabled()) { LOG.trace("found " + outlinks.length + " outlinks in " + url); } } ParseStatus status = ParseStatus.newBuilder().build(); status.setMajorCode((int) ParseStatusCodes.SUCCESS); if (metaTags.getRefresh()) { status.setMinorCode((int) ParseStatusCodes.SUCCESS_REDIRECT); status.getArgs().add(new Utf8(metaTags.getRefreshHref().toString())); status.getArgs().add(new Utf8(Integer.toString(metaTags.getRefreshTime()))); } String strJo = addJsonToPage(url, page); // storeJsonToSchema(url, page ,strJo); page.setReadable(new Utf8(strJo)); Parse parse = new Parse(text, title, outlinks, status, strJo); parse = htmlParseFilters.filter(url, page, parse, metaTags, root); if (metaTags.getNoCache()) { // not okay to cache page.getMetadata().put(new Utf8(Nutch.CACHING_FORBIDDEN_KEY), ByteBuffer.wrap(Bytes.toBytes(cachingPolicy))); } parse.setJsonRead(strJo); return parse; } private String addJsonToPage(String url, WebPage page) { String jsonStr = null; //====================the part of convert to json Map<CharSequence, ByteBuffer> metadataByte = page.getMetadata(); Metadata metaData = new Metadata(); for (CharSequence cs : metadataByte.keySet()) { String csString = cs.toString(); String value = String.valueOf(metadataByte.get(cs).array()); metaData.add(csString, value); } Content r = new Content(url, url, page.getContent().array(), page.getContentType().toString(), metaData, conf); JsoupTokopedia jsT = new JsoupTokopedia(); try { JSONObject jo = jsT.constructJson(r); jsonStr = String.valueOf(jo); } catch (JSONException e) { e.printStackTrace(); } return jsonStr; } private DocumentFragment parse(InputSource input) throws Exception { System.out.println("[STORE-READABLE]---------------------------------------------------parse"); if (parserImpl.equalsIgnoreCase("tagsoup")) return parseTagSoup(input); else return parseNeko(input); } private DocumentFragment parseTagSoup(InputSource input) 
throws Exception { System.out.println("[STORE-READABLE]---------------------------------------------------parseTagSoup"); HTMLDocumentImpl doc = new HTMLDocumentImpl(); DocumentFragment frag = doc.createDocumentFragment(); DOMBuilder builder = new DOMBuilder(doc, frag); org.ccil.cowan.tagsoup.Parser reader = new org.ccil.cowan.tagsoup.Parser(); reader.setContentHandler(builder); reader.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true); reader.setFeature(org.ccil.cowan.tagsoup.Parser.bogonsEmptyFeature, false); reader.setProperty("http://xml.org/sax/properties/lexical-handler", builder); reader.parse(input); return frag; } private DocumentFragment parseNeko(InputSource input) throws Exception { System.out.println("[STORE-READABLE]----------------------------------------------------parseNeko"); DOMFragmentParser parser = new DOMFragmentParser(); try { parser.setFeature("http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe", true); parser.setFeature("http://cyberneko.org/html/features/augmentations", true); parser.setProperty("http://cyberneko.org/html/properties/default-encoding", defaultCharEncoding); parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset", true); parser.setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content", false); parser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true); parser.setFeature("http://cyberneko.org/html/features/report-errors", LOG.isTraceEnabled()); } catch (SAXException e) { } // convert Document to DocumentFragment HTMLDocumentImpl doc = new HTMLDocumentImpl(); doc.setErrorChecking(false); DocumentFragment res = doc.createDocumentFragment(); DocumentFragment frag = doc.createDocumentFragment(); parser.parse(input, frag); res.appendChild(frag); try { while (true) { frag = doc.createDocumentFragment(); parser.parse(input, frag); if (!frag.hasChildNodes()) break; if (LOG.isInfoEnabled()) { LOG.info(" - new 
frag, " + frag.getChildNodes().getLength() + " nodes."); } res.appendChild(frag); } } catch (Exception x) { LOG.error("Failed with the following Exception: ", x); } ; return res; } public void setConf(Configuration conf) { this.conf = conf; this.htmlParseFilters = new ParseFilters(getConf()); this.parserImpl = getConf().get("parser.html.impl", "neko"); this.defaultCharEncoding = getConf().get("parser.character.encoding.default", "windows-1252"); this.utils = new DOMContentUtils(conf); this.cachingPolicy = getConf().get("parser.caching.forbidden.policy", Nutch.CACHING_FORBIDDEN_CONTENT); } public Configuration getConf() { return this.conf; } public void storeJsonToSchema(String url, WebPage page, String jsonPar) { // try { // // String rowKey=TableUtil.reverseUrl(url); // DataStore<String, WebPage> store = StorageUtilsPlugin.createWebStore(getConf(), // String.class, WebPage.class, null); // // page.setReadable(new Utf8(jsonPar)); // if(jsonPar.length()>2) { // store.put(rowKey, page); // } // store.execute(store.newQuery()); // // store.flush(); // store.close(); // // } catch (ClassNotFoundException e) { // e.printStackTrace(); // } catch (GoraException e) { // e.printStackTrace(); // } catch (MalformedURLException e) { // e.printStackTrace(); // } } @Override public Collection<WebPage.Field> getFields() { return FIELDS; } public static void main(String[] args) throws Exception { String rowKey = "com.tokopedia.www:https/xiaomi-mobile/xiaomi-yi-action-camera-paket-komplit-16gb"; String id = "webpage"; String schemaPrefix = "TestCrawl"; Configuration localConf = NutchConfiguration.create(); DataStore<String, WebPage> store = StorageUtilsPlugin.createWebStore(localConf, String.class, WebPage.class, schemaPrefix); // WebPage camera = store.get(rowKey); // LOG.setLevel(Level.FINE); // String name = args[0]; String url = String.valueOf(camera.getBaseUrl());//"file:"+name; // File file = new File(name); // byte[] bytes = new byte[(int)file.length()]; // DataInputStream 
in = new DataInputStream(new FileInputStream(file)); // in.readFully(bytes); // Configuration conf = NutchConfiguration.create(); StoreReadable parser = new StoreReadable(); parser.setConf(localConf); // WebPage page = WebPage.newBuilder().build(); // page.setBaseUrl(new Utf8(url)); // page.setContent(ByteBuffer.wrap(bytes)); // page.setContentType(new Utf8("text/html")); Parse parse = parser.getParse(url, camera); System.out.println("title: " + parse.getTitle()); System.out.println("text: " + parse.getText()); System.out.println("outlinks: " + Arrays.toString(parse.getOutlinks())); store.put(rowKey, camera); store.flush(); WebPage page2 = store.get(rowKey); String resultPage2 = page2.getReadable().toString(); System.out.println("result2=" + resultPage2); store.execute(store.newQuery()); store.close(); String reverseUrl = TableUtil.reverseUrl(url); System.out.println("reversedUrl:" + reverseUrl); } }