List of usage examples for org.apache.hadoop.conf.Configuration.getConfResourceAsInputStream
public InputStream getConfResourceAsInputStream(String name)
name
From source file:cn.edu.hfut.dmic.webcollectorcluster.crawler.Crawler.java
@Override public Generator createGenerator(Generator generator) { if (interval == null) { interval = CrawlerConfiguration.create().getLong("generator.interval", -1); }//from ww w. ja v a 2s .c om try { Configuration conf = CrawlerConfiguration.create(); InputStream regexIs = conf.getConfResourceAsInputStream("regex"); BufferedReader br = new BufferedReader(new InputStreamReader(regexIs)); ArrayList<String> regexRules = new ArrayList<String>(); String line; while ((line = br.readLine()) != null) { regexRules.add(line); } //return new URLRegexFilter(generator, regexRules); return new URLRegexFilter(new IntervalFilter(generator, interval), regexRules); } catch (Exception ex) { LogUtils.getLogger().info("Exception", ex); return null; } }
From source file:com.reidin.ppd.listings.date.DateListingsFilter.java
License:Apache License
private HashMap<String, HashMap<String, String>> readRules(Configuration conf) throws IOException { BufferedReader reader = new BufferedReader( new InputStreamReader(conf.getConfResourceAsInputStream(conf.get(DATE_CONFIG_FILE)))); String line = null;//www . j a v a2 s . c om HashMap<String, HashMap<String, String>> map = new HashMap<String, HashMap<String, String>>(); while ((line = reader.readLine()) != null) { if (!line.startsWith("#") && line.contains("=") && line.contains("|")) { String[] strings = line.split("="); String[] keys = strings[0].split("\\|"); HashMap<String, String> values = map.get(keys[0]); if (values == null) { values = new HashMap<String, String>(); } values.put(keys[1], strings[1]); map.put(keys[0], values); } } return map; }
From source file:org.apache.nutch.exchange.Exchanges.java
License:Apache License
/** * Loads the configuration of each exchange. * * @param conf Nutch's configuration./*from www .j a va 2 s . co m*/ * @return An array with each exchange's configuration. */ private ExchangeConfig[] loadConfigurations(Configuration conf) { String filename = conf.get("exchanges.exchanges.file", "exchanges.xml"); InputSource inputSource = new InputSource(conf.getConfResourceAsInputStream(filename)); final List<ExchangeConfig> configList = new LinkedList<>(); try { DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); DocumentBuilder builder = factory.newDocumentBuilder(); Element rootElement = builder.parse(inputSource).getDocumentElement(); NodeList exchangeList = rootElement.getElementsByTagName("exchange"); for (int i = 0; i < exchangeList.getLength(); i++) { Element element = (Element) exchangeList.item(i); ExchangeConfig exchangeConfig = ExchangeConfig.getInstance(element); if ("default".equals(exchangeConfig.getClazz())) { this.defaultExchangeConfig = exchangeConfig; continue; } configList.add(exchangeConfig); } } catch (SAXException | IOException | ParserConfigurationException e) { LOG.error(e.toString()); } return configList.toArray(new ExchangeConfig[0]); }
From source file:org.apache.nutch.parse.ParsePluginsReader.java
License:Apache License
/** * Reads the <code>parse-plugins.xml</code> file and returns the * {@link #ParsePluginList} defined by it. * /*ww w .j a va 2s.com*/ * @return A {@link #ParsePluginList} specified by the * <code>parse-plugins.xml</code> file. * @throws Exception * If any parsing error occurs. */ public ParsePluginList parse(Configuration conf) { ParsePluginList pList = new ParsePluginList(); // open up the XML file DocumentBuilderFactory factory = null; DocumentBuilder parser = null; Document document = null; InputSource inputSource = null; InputStream ppInputStream = null; if (fParsePluginsFile != null) { URL parsePluginUrl = null; try { parsePluginUrl = new URL(fParsePluginsFile); ppInputStream = parsePluginUrl.openStream(); } catch (Exception e) { if (LOG.isWarnEnabled()) { LOG.warn("Unable to load parse plugins file from URL " + "[" + fParsePluginsFile + "]. Reason is [" + e + "]"); } return pList; } } else { ppInputStream = conf.getConfResourceAsInputStream(conf.get(PP_FILE_PROP)); } inputSource = new InputSource(ppInputStream); try { factory = DocumentBuilderFactory.newInstance(); parser = factory.newDocumentBuilder(); document = parser.parse(inputSource); } catch (Exception e) { if (LOG.isWarnEnabled()) { LOG.warn("Unable to parse [" + fParsePluginsFile + "]." 
+ "Reason is [" + e + "]"); } return null; } Element parsePlugins = document.getDocumentElement(); // build up the alias hash map Map<String, String> aliases = getAliases(parsePlugins); // And store it on the parse plugin list pList.setAliases(aliases); // get all the mime type nodes NodeList mimeTypes = parsePlugins.getElementsByTagName("mimeType"); // iterate through the mime types for (int i = 0; i < mimeTypes.getLength(); i++) { Element mimeType = (Element) mimeTypes.item(i); String mimeTypeStr = mimeType.getAttribute("name"); // for each mimeType, get the plugin list NodeList pluginList = mimeType.getElementsByTagName("plugin"); // iterate through the plugins, add them in order read // OR if they have a special order="" attribute, then hold those in // a separate list, and then insert them into the final list at the // order specified if (pluginList != null && pluginList.getLength() > 0) { List<String> plugList = new ArrayList<String>(pluginList.getLength()); for (int j = 0; j < pluginList.getLength(); j++) { Element plugin = (Element) pluginList.item(j); String pluginId = plugin.getAttribute("id"); String extId = aliases.get(pluginId); if (extId == null) { // Assume an extension id is directly specified extId = pluginId; } String orderStr = plugin.getAttribute("order"); int order = -1; try { order = Integer.parseInt(orderStr); } catch (NumberFormatException ignore) { } if (order != -1) { plugList.add(order - 1, extId); } else { plugList.add(extId); } } // now add the plugin list and map it to this mimeType pList.setPluginList(mimeTypeStr, plugList); } else if (LOG.isWarnEnabled()) { LOG.warn("ParsePluginsReader:ERROR:no plugins defined for mime type: " + mimeTypeStr + ", continuing parse"); } } return pList; }
From source file:org.apache.nutch.protocol.http.proxy.api.HttpBase.java
License:Apache License
public void setConf(Configuration conf) { this.conf = conf; this.webProtectSkip = conf.getBoolean(MyConstant.SKIP_WEB_CRAWL_PROTECT, false);// ?? this.proxyHost = conf.get("http.proxy.host"); this.proxyPort = conf.getInt("http.proxy.port", 8080); // ?IP/* w w w .j av a2 s .c o m*/ this.proxyReqMax = conf.getInt("http.proxy.reqmax", 500); this.useProxy = (proxyHost != null && proxyHost.length() > 0); this.timeout = conf.getInt("http.timeout", 10000); this.maxContent = conf.getInt("http.content.limit", 64 * 1024); this.userAgent = getAgentString(conf.get("http.agent.name"), conf.get("http.agent.version"), conf.get("http.agent.description"), conf.get("http.agent.url"), conf.get("http.agent.email")); this.acceptLanguage = conf.get("http.accept.language", acceptLanguage); this.accept = conf.get("http.accept", accept); // backward-compatible default setting this.useHttp11 = conf.getBoolean("http.useHttp11", false); this.responseTime = conf.getBoolean("http.store.responsetime", true); this.enableIfModifiedsinceHeader = conf.getBoolean("http.enable.if.modified.since.header", true); this.robots.setConf(conf); // NUTCH-1941: read list of alternating agent names if (conf.getBoolean("http.agent.rotate", false)) { String agentsFile = conf.get("http.agent.rotate.file", "agents.txt"); BufferedReader br = null; try { Reader reader = conf.getConfResourceAsReader(agentsFile); br = new BufferedReader(reader); userAgentNames = new ArrayList<String>(); String word = ""; while ((word = br.readLine()) != null) { if (!word.trim().isEmpty()) userAgentNames.add(word.trim()); } if (userAgentNames.size() == 0) { logger.warn("Empty list of user agents in http.agent.rotate.file {}", agentsFile); userAgentNames = null; } } catch (Exception e) { logger.warn("Failed to read http.agent.rotate.file {}: {}", agentsFile, StringUtils.stringifyException(e)); userAgentNames = null; } finally { if (br != null) { try { br.close(); } catch (IOException e) { // ignore } } } if (userAgentNames == null) { 
logger.warn("Falling back to fixed user agent set via property http.agent.name"); } } String[] protocols = conf.getStrings("http.tls.supported.protocols", "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3"); String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites", "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384", "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384", "TLS_RSA_WITH_AES_256_CBC_SHA256", "TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384", "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384", "TLS_DHE_RSA_WITH_AES_256_CBC_SHA256", "TLS_DHE_DSS_WITH_AES_256_CBC_SHA256", "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA", "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA", "TLS_RSA_WITH_AES_256_CBC_SHA", "TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA", "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA", "TLS_DHE_RSA_WITH_AES_256_CBC_SHA", "TLS_DHE_DSS_WITH_AES_256_CBC_SHA", "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256", "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256", "TLS_RSA_WITH_AES_128_CBC_SHA256", "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256", "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256", "TLS_DHE_RSA_WITH_AES_128_CBC_SHA256", "TLS_DHE_DSS_WITH_AES_128_CBC_SHA256", "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA", "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA", "TLS_RSA_WITH_AES_128_CBC_SHA", "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA", "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA", "TLS_DHE_RSA_WITH_AES_128_CBC_SHA", "TLS_DHE_DSS_WITH_AES_128_CBC_SHA", "TLS_ECDHE_ECDSA_WITH_RC4_128_SHA", "TLS_ECDHE_RSA_WITH_RC4_128_SHA", "SSL_RSA_WITH_RC4_128_SHA", "TLS_ECDH_ECDSA_WITH_RC4_128_SHA", "TLS_ECDH_RSA_WITH_RC4_128_SHA", "TLS_ECDHE_ECDSA_WITH_3DES_EDE_CBC_SHA", "TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_3DES_EDE_CBC_SHA", "TLS_ECDH_ECDSA_WITH_3DES_EDE_CBC_SHA", "TLS_ECDH_RSA_WITH_3DES_EDE_CBC_SHA", "SSL_DHE_RSA_WITH_3DES_EDE_CBC_SHA", "SSL_DHE_DSS_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_RC4_128_MD5", "TLS_EMPTY_RENEGOTIATION_INFO_SCSV", "TLS_RSA_WITH_NULL_SHA256", "TLS_ECDHE_ECDSA_WITH_NULL_SHA", "TLS_ECDHE_RSA_WITH_NULL_SHA", "SSL_RSA_WITH_NULL_SHA", "TLS_ECDH_ECDSA_WITH_NULL_SHA", 
"TLS_ECDH_RSA_WITH_NULL_SHA", "SSL_RSA_WITH_NULL_MD5", "SSL_RSA_WITH_DES_CBC_SHA", "SSL_DHE_RSA_WITH_DES_CBC_SHA", "SSL_DHE_DSS_WITH_DES_CBC_SHA", "TLS_KRB5_WITH_RC4_128_SHA", "TLS_KRB5_WITH_RC4_128_MD5", "TLS_KRB5_WITH_3DES_EDE_CBC_SHA", "TLS_KRB5_WITH_3DES_EDE_CBC_MD5", "TLS_KRB5_WITH_DES_CBC_SHA", "TLS_KRB5_WITH_DES_CBC_MD5"); tlsPreferredProtocols = new HashSet<String>(Arrays.asList(protocols)); tlsPreferredCipherSuites = new HashSet<String>(Arrays.asList(ciphers)); logConf(); InputStream is = null; BufferedReader dr = null; try { LOGGER.info("??........."); is = conf.getConfResourceAsInputStream("proxylist.conf"); if (is == null) { return; } dr = new BufferedReader(new InputStreamReader(is)); String tmp = null; while ((tmp = dr.readLine()) != null) { if (!"".equals(tmp) && !tmp.startsWith("#")) { proxyList.add(tmp); LOGGER.info(tmp); } } LOGGER.info("????"); if (!proxyList.isEmpty()) { this.useProxy = true; } } catch (Exception e) { logger.error("custom proxylist read error :", e); } finally { if (dr != null) { try { dr.close(); } catch (IOException e) { e.printStackTrace(); } } if (is != null) { try { is.close(); } catch (IOException e) { e.printStackTrace(); } } } }
From source file:org.apache.nutch.tools.PruneIndexTool.java
License:Apache License
/**
 * Command-line entry point of the prune index tool.
 *
 * <p>The first argument is an index directory, or a directory whose
 * sub-directories contain an {@code index} directory. Recognized options:
 * {@code -force}, {@code -dryrun}, {@code -queries <file>},
 * {@code -output <file>} and {@code -showfields <f1,f2,...>}. When
 * {@code -queries} is absent the queries are loaded from the configuration
 * resource named by the {@code prune.index.tool.queries} property.
 *
 * @param args command-line arguments
 * @throws Exception if the index cannot be inspected or the queries file
 *         cannot be opened
 */
public static void main(String[] args) throws Exception {
    if (args.length == 0) {
        usage();
        if (LOG.isFatalEnabled()) {
            LOG.fatal("Missing arguments");
        }
        return;
    }
    File idx = new File(args[0]);
    if (!idx.isDirectory()) {
        usage();
        if (LOG.isFatalEnabled()) {
            LOG.fatal("Not a directory: " + idx);
        }
        return;
    }

    Vector<File> paths = new Vector<File>();
    if (IndexReader.indexExists(idx)) {
        paths.add(idx);
    } else {
        // try and see if there are segments inside, with index dirs
        File[] dirs = idx.listFiles(new FileFilter() {
            public boolean accept(File f) {
                return f.isDirectory();
            }
        });
        if (dirs == null || dirs.length == 0) {
            usage();
            if (LOG.isFatalEnabled()) {
                LOG.fatal("No indexes in " + idx);
            }
            return;
        }
        for (int i = 0; i < dirs.length; i++) {
            File sidx = new File(dirs[i], "index");
            if (sidx.exists() && sidx.isDirectory() && IndexReader.indexExists(sidx)) {
                paths.add(sidx);
            }
        }
        if (paths.size() == 0) {
            usage();
            if (LOG.isFatalEnabled()) {
                LOG.fatal("No indexes in " + idx + " or its subdirs.");
            }
            return;
        }
    }
    File[] indexes = paths.toArray(new File[0]);

    boolean force = false;
    boolean dryrun = false;
    String qPath = null;
    String outPath = null;
    String fList = null;
    for (int i = 1; i < args.length; i++) {
        String option = args[i];
        if (option.equals("-force")) {
            force = true;
        } else if (option.equals("-dryrun")) {
            dryrun = true;
        } else if (option.equals("-queries") || option.equals("-output") || option.equals("-showfields")) {
            // These options take a value; make sure one is actually present
            // instead of running off the end of the argument array.
            if (i + 1 >= args.length) {
                usage();
                if (LOG.isFatalEnabled()) {
                    LOG.fatal("Missing value for option: " + option);
                }
                return;
            }
            String value = args[++i];
            if (option.equals("-queries")) {
                qPath = value;
            } else if (option.equals("-output")) {
                outPath = value;
            } else {
                fList = value;
            }
        } else {
            usage();
            if (LOG.isFatalEnabled()) {
                LOG.fatal("Unrecognized option: " + option);
            }
            return;
        }
    }

    Vector<PruneChecker> cv = new Vector<PruneChecker>();
    if (fList != null) {
        StringTokenizer st = new StringTokenizer(fList, ",");
        Vector<String> tokens = new Vector<String>();
        while (st.hasMoreTokens()) {
            tokens.add(st.nextToken());
        }
        String[] fields = tokens.toArray(new String[0]);
        PruneChecker pc = new PrintFieldsChecker(System.out, fields);
        cv.add(pc);
    }
    if (outPath != null) {
        StoreUrlsChecker luc = new StoreUrlsChecker(new File(outPath), false);
        cv.add(luc);
    }
    PruneChecker[] checkers = null;
    if (cv.size() > 0) {
        checkers = cv.toArray(new PruneChecker[0]);
    }

    Query[] queries = null;
    InputStream is = null;
    if (qPath != null) {
        is = new FileInputStream(qPath);
    } else {
        Configuration conf = NutchConfiguration.create();
        qPath = conf.get("prune.index.tool.queries");
        if (qPath != null) {
            is = conf.getConfResourceAsInputStream(qPath);
        }
    }
    if (is == null) {
        if (LOG.isFatalEnabled()) {
            LOG.fatal("Can't load queries from " + qPath);
        }
        return;
    }
    try {
        queries = parseQueries(is);
    } catch (Exception e) {
        if (LOG.isFatalEnabled()) {
            LOG.fatal("Error parsing queries: " + e.getMessage(), e);
        }
        return;
    } finally {
        // Release the queries stream whether or not parsing succeeded.
        try {
            is.close();
        } catch (java.io.IOException ignored) {
            // Nothing useful can be done if close fails here.
        }
    }
    try {
        PruneIndexTool pit = new PruneIndexTool(indexes, queries, checkers, force, dryrun);
        pit.run();
    } catch (Exception e) {
        if (LOG.isFatalEnabled()) {
            LOG.fatal("Error running PruneIndexTool: " + e.getMessage(), e);
        }
        return;
    }
}
From source file:org.apache.nutch.util.MimeUtil.java
License:Apache License
public MimeUtil(Configuration conf) { tika = new Tika(); ObjectCache objectCache = ObjectCache.get(conf); MimeTypes mimeTypez = (MimeTypes) objectCache.getObject(MimeTypes.class.getName()); if (mimeTypez == null) { try {/*from www . ja va 2 s .co m*/ String customMimeTypeFile = conf.get("mime.types.file"); if (customMimeTypeFile != null && customMimeTypeFile.equals("") == false) { try { mimeTypez = MimeTypesFactory.create(conf.getConfResourceAsInputStream(customMimeTypeFile)); } catch (Exception e) { LOG.error("Can't load mime.types.file : " + customMimeTypeFile + " using Tika's default"); } } if (mimeTypez == null) mimeTypez = MimeTypes.getDefaultMimeTypes(); } catch (Exception e) { LOG.error("Exception in MimeUtil " + e.getMessage()); throw new RuntimeException(e); } objectCache.setObject(MimeTypes.class.getName(), mimeTypez); } this.mimeTypes = mimeTypez; this.mimeMagic = conf.getBoolean("mime.type.magic", true); }