List of usage examples for org.apache.hadoop.conf.Configuration.getConfResourceAsReader

public Reader getConfResourceAsReader(String name)

Gets a Reader attached to the configuration resource with the given name. The name parameter is the configuration resource name (for example, a file on the classpath such as Nutch's conf directory); the method returns null if the resource cannot be found.
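Before the per-project examples below, here is a minimal, self-contained sketch of the typical pattern: resolve a resource name from a configuration property, open it with getConfResourceAsReader, and read it line by line. The property name my.plugin.rules.file and the default file name rules.txt are hypothetical placeholders for illustration only, not taken from any of the source files listed.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;

import org.apache.hadoop.conf.Configuration;

public class ConfResourceExample {

    // Hypothetical property and default file name, used only for this sketch.
    private static final String RULES_FILE_PROPERTY = "my.plugin.rules.file";

    public static void readRules(Configuration conf) throws IOException {
        // Resolve the resource name from the configuration, then open it as a character stream.
        String resourceName = conf.get(RULES_FILE_PROPERTY, "rules.txt");
        Reader reader = conf.getConfResourceAsReader(resourceName);
        if (reader == null) {
            throw new IOException("Configuration resource not found: " + resourceName);
        }
        try (BufferedReader in = new BufferedReader(reader)) {
            String line;
            while ((line = in.readLine()) != null) {
                line = line.trim();
                if (line.isEmpty() || line.startsWith("#")) {
                    continue; // skip blank lines and comments
                }
                // process the rule line ...
            }
        }
    }
}

Most of the real examples below follow this shape: the resource name comes from a configuration property, and the returned Reader is wrapped in a BufferedReader or handed to a parser.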
. From source file:com.atlantbh.nutch.filter.index.omit.config.OmitIndexingFilterConfiguration.java
License:Apache License
public static OmitIndexingFilterConfiguration getInstance(Configuration configuration) {
    try {
        // Get configuration from Nutch /conf folder
        Reader configReader = configuration
                .getConfResourceAsReader(configuration.get(CONFIG_FILE_PATH_PROPERTY));

        // Initialize JAXB
        JAXBContext context = JAXBContext.newInstance(new Class[] { OmitIndexingFilterConfiguration.class,
                OmitIndexingFilterConfigurationEntry.class, FilteringType.class, Target.class });
        Unmarshaller unmarshaller = context.createUnmarshaller();

        // Initialize configuration
        OmitIndexingFilterConfiguration xPathFilterConfiguration = (OmitIndexingFilterConfiguration) unmarshaller
                .unmarshal(configReader);

        return xPathFilterConfiguration;
    } catch (JAXBException e) {
        log.error("Configuration initialization error!");
    }
    return null;
}
From source file:com.atlantbh.nutch.index.alternativedataflow.conf.AlternativeDataFlowIndexingFilterConfiguration.java
License:Apache License
public static AlternativeDataFlowIndexingFilterConfiguration getInstance(Configuration configuration) {
    try {
        // Get configuration from Nutch /conf folder
        Reader configReader = configuration
                .getConfResourceAsReader(configuration.get(CONFIG_FILE_PATH_PROPERTY));

        // Initialize JAXB
        JAXBContext context = JAXBContext.newInstance(
                new Class[] { AlternativeDataFlowIndexingFilterConfiguration.class, Entry.class, Field.class });
        Unmarshaller unmarshaller = context.createUnmarshaller();

        // Initialize configuration
        AlternativeDataFlowIndexingFilterConfiguration xPathFilterConfiguration = (AlternativeDataFlowIndexingFilterConfiguration) unmarshaller
                .unmarshal(configReader);

        return xPathFilterConfiguration;
    } catch (JAXBException e) {
        log.error("Configuration initialization error!");
    }
    return null;
}
From source file:de.informera.dev.nutchManager.thirdParty.RegexURLFilter.java
License:Apache License
/**
 * Rules specified as a config property will override rules specified
 * as a config file.
 */
protected Reader getRulesReader(Configuration conf) throws IOException {
    String stringRules = conf.get(URLFILTER_REGEX_RULES);
    if (stringRules != null) {
        return new StringReader(stringRules);
    }
    String fileRules = conf.get(URLFILTER_REGEX_FILE);
    return conf.getConfResourceAsReader(fileRules);
}
From source file:org.apache.nutch.analysis.CommonGrams.java
License:Apache License
/** Construct using the provided config file. */
private void init(Configuration conf) {
    // First, try to retrieve some commonTerms cached in configuration.
    commonTerms = (HashMap) conf.getObject(KEY);
    if (commonTerms != null) {
        return;
    }

    // Otherwise, read the terms.file
    try {
        commonTerms = new HashMap();
        Reader reader = conf.getConfResourceAsReader(conf.get("analysis.common.terms.file"));
        BufferedReader in = new BufferedReader(reader);
        String line;
        while ((line = in.readLine()) != null) {
            line = line.trim();
            if (line.startsWith("#") || "".equals(line)) // skip comments
                continue;
            TokenStream ts = new NutchDocumentTokenizer(new StringReader(line));
            Token token = ts.next();
            if (token == null) {
                if (LOG.isWarnEnabled()) {
                    LOG.warn("Line does not contain a field name: " + line);
                }
                continue;
            }
            String field = token.termText();
            token = ts.next();
            if (token == null) {
                if (LOG.isWarnEnabled()) {
                    LOG.warn("Line contains only a field name, no word: " + line);
                }
                continue;
            }
            String gram = token.termText();
            while ((token = ts.next()) != null) {
                gram = gram + SEPARATOR + token.termText();
            }
            HashSet table = (HashSet) commonTerms.get(field);
            if (table == null) {
                table = new HashSet();
                commonTerms.put(field, table);
            }
            table.add(gram);
        }
        conf.setObject(KEY, commonTerms);
    } catch (IOException e) {
        throw new RuntimeException(e.toString());
    }
}
From source file:org.apache.nutch.crawl.MimeAdaptiveFetchSchedule.java
License:Apache License
public void setConf(Configuration conf) {
    super.setConf(conf);
    if (conf == null)
        return;

    // Read and set the default INC and DEC rates in case we cannot set values based on MIME-type
    defaultIncRate = conf.getFloat(SCHEDULE_INC_RATE, 0.2f);
    defaultDecRate = conf.getFloat(SCHEDULE_DEC_RATE, 0.2f);

    // Where's the mime/factor file?
    Reader mimeFile = conf.getConfResourceAsReader(conf.get(SCHEDULE_MIME_FILE, "adaptive-mimetypes.txt"));

    try {
        readMimeFile(mimeFile);
    } catch (IOException e) {
        LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
    }
}
From source file:org.apache.nutch.indexer.filter.MimeTypeIndexingFilter.java
License:Apache License
@Override
public void setConf(Configuration conf) {
    this.conf = conf;
    MIME = new MimeUtil(conf);

    // load the file of the values
    String file = conf.get(MIMEFILTER_REGEX_FILE, "");

    if (file != null) {
        if (file.isEmpty()) {
            LOG.warn(String.format("Missing %s property, ALL mimetypes will be allowed", MIMEFILTER_REGEX_FILE));
        } else {
            Reader reader = conf.getConfResourceAsReader(file);

            try {
                readConfiguration(reader);
            } catch (IOException e) {
                if (LOG.isErrorEnabled()) {
                    LOG.error(e.getMessage());
                }
                throw new RuntimeException(e.getMessage(), e);
            }
        }
    }
}
From source file:org.apache.nutch.indexer.urlfilter.UrlIndexingFilter.java
License:Apache License
@Override
protected Reader getRulesReader(Configuration conf) throws IOException {
    String fileRules = conf.get(URLINDEXINGFILTER_REGEX_FILE);
    return conf.getConfResourceAsReader(fileRules);
}
From source file:org.apache.nutch.indexwriter.elastic.ElasticIndexWriter.java
License:Apache License
@Override
public void open(Configuration job) throws IOException {
    clusterName = job.get(ElasticConstants.CLUSTER);
    host = job.get(ElasticConstants.HOST);
    port = job.getInt(ElasticConstants.PORT, 9300);

    Builder settingsBuilder = ImmutableSettings.settingsBuilder().classLoader(Settings.class.getClassLoader());

    // Parse additional key=value settings from the elasticsearch.conf resource.
    BufferedReader reader = new BufferedReader(job.getConfResourceAsReader("elasticsearch.conf"));
    String line;
    String[] parts;

    while ((line = reader.readLine()) != null) {
        if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
            line = line.trim();
            parts = line.split("=");

            if (parts.length == 2) {
                settingsBuilder.put(parts[0].trim(), parts[1].trim());
            }
        }
    }

    if (StringUtils.isNotBlank(clusterName))
        settingsBuilder.put("cluster.name", clusterName);

    // Set the cluster name and build the settings
    Settings settings = settingsBuilder.build();

    // Prefer TransportClient
    if (host != null && port > 1) {
        client = new TransportClient(settings).addTransportAddress(new InetSocketTransportAddress(host, port));
    } else if (clusterName != null) {
        node = nodeBuilder().settings(settings).client(true).node();
        client = node.client();
    }

    bulk = client.prepareBulk();
    defaultIndex = job.get(ElasticConstants.INDEX, "nutch");
    maxBulkDocs = job.getInt(ElasticConstants.MAX_BULK_DOCS, DEFAULT_MAX_BULK_DOCS);
    maxBulkLength = job.getInt(ElasticConstants.MAX_BULK_LENGTH, DEFAULT_MAX_BULK_LENGTH);
}
From source file:org.apache.nutch.indexwriter.elastic2.ElasticIndexWriter.java
License:Apache License
@Override
public void open(Configuration job) throws IOException {
    clusterName = job.get(ElasticConstants.CLUSTER);
    host = job.get(ElasticConstants.HOST);
    port = job.getInt(ElasticConstants.PORT, 9300);

    Builder settingsBuilder = Settings.builder();

    // Parse additional key=value settings from the elasticsearch.conf resource.
    BufferedReader reader = new BufferedReader(job.getConfResourceAsReader("elasticsearch.conf"));
    String line;
    String[] parts;

    while ((line = reader.readLine()) != null) {
        if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
            line = line.trim();
            parts = line.split("=");

            if (parts.length == 2) {
                settingsBuilder.put(parts[0].trim(), parts[1].trim());
            }
        }
    }

    if (StringUtils.isNotBlank(clusterName))
        settingsBuilder.put("cluster.name", clusterName);

    // Set the cluster name and build the settings
    Settings settings = settingsBuilder.build();

    // Prefer TransportClient
    if (host != null && port > 1) {
        client = TransportClient.builder().settings(settings).build()
                .addTransportAddress(new InetSocketTransportAddress(InetAddress.getByName(host), port));
    } else if (clusterName != null) {
        node = nodeBuilder().settings(settings).client(true).node();
        client = node.client();
    }

    bulk = client.prepareBulk();
    defaultIndex = job.get(ElasticConstants.INDEX, "nutch");
    maxBulkDocs = job.getInt(ElasticConstants.MAX_BULK_DOCS, DEFAULT_MAX_BULK_DOCS);
    maxBulkLength = job.getInt(ElasticConstants.MAX_BULK_LENGTH, DEFAULT_MAX_BULK_LENGTH);
}
From source file:org.apache.nutch.net.RegexURLFilter.java
License:Apache License
/**
 * Rules specified as a config property will override rules specified as a
 * config file.
 */
protected Reader getRulesReader(Configuration conf) throws IOException {
    String stringRules = conf.get(URLFILTER_REGEX_RULES);
    if (stringRules != null) {
        if (LOG.isDebugEnabled()) {
            // LOG.debug("Url filter regex rules : \n" + stringRules);
        }
        return new StringReader(stringRules);
    }
    String fileRules = conf.get(URLFILTER_REGEX_FILE);
    return conf.getConfResourceAsReader(fileRules);
}