List of usage examples for org.apache.hadoop.conf.Configuration#getConfResourceAsReader

public Reader getConfResourceAsReader(String name)

Parameter: name - the name of the configuration resource to read. Returns a Reader attached to the named classpath resource, or null if the resource cannot be found.
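Before the project-specific examples below, here is a minimal, self-contained sketch of the pattern they all share: resolve a resource from the configuration's classpath, guard against a null reader, and read it line by line. The resource name my-wordlist.txt is a hypothetical placeholder.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;

public class ConfResourceReaderExample {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    // getConfResourceAsReader returns null when the named resource is not
    // on the classpath, so callers should check before wrapping it
    Reader reader = conf.getConfResourceAsReader("my-wordlist.txt");
    if (reader == null) {
      System.err.println("my-wordlist.txt not found on the classpath");
      return;
    }
    List<String> lines = new ArrayList<String>();
    try (BufferedReader br = new BufferedReader(reader)) {
      String line;
      while ((line = br.readLine()) != null) {
        lines.add(line.trim());
      }
    }
    System.out.println("Read " + lines.size() + " lines");
  }
}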
From source file: org.apache.nutch.parsefilter.naivebayes.NaiveBayesParseFilter.java
License: Apache License

public void setConf(Configuration conf) {
  this.conf = conf;
  inputFilePath = conf.get(TRAINFILE_MODELFILTER);
  dictionaryFile = conf.get(DICTFILE_MODELFILTER);
  if (inputFilePath == null || inputFilePath.trim().length() == 0
      || dictionaryFile == null || dictionaryFile.trim().length() == 0) {
    String message = "ParseFilter: NaiveBayes: trainfile or wordlist not set in the "
        + "parsefilter.naivebayes.trainfile or parsefilter.naivebayes.wordlist";
    if (LOG.isErrorEnabled()) {
      LOG.error(message);
    }
    throw new IllegalArgumentException(message);
  }
  try {
    // both the training file and the wordlist must exist
    if (!FileSystem.get(conf).exists(new Path(inputFilePath))
        || !FileSystem.get(conf).exists(new Path(dictionaryFile))) {
      String message = "ParseFilter: NaiveBayes: " + inputFilePath + " or "
          + dictionaryFile + " not found!";
      if (LOG.isErrorEnabled()) {
        LOG.error(message);
      }
      throw new IllegalArgumentException(message);
    }
    Reader reader = conf.getConfResourceAsReader(dictionaryFile);
    try (BufferedReader br = new BufferedReader(reader)) {
      String currentLine;
      while ((currentLine = br.readLine()) != null) {
        wordlist.add(currentLine);
      }
    }
  } catch (IOException e) {
    LOG.error(StringUtils.stringifyException(e));
  }
  try {
    train();
  } catch (Exception e) {
    LOG.error("Error occurred while training: " + StringUtils.stringifyException(e));
  }
}
From source file: org.apache.nutch.parsefilter.naivebayes.Train.java
License: Apache License

public static void start(String filepath) throws IOException {
  // two classes: 0/irrelevant and 1/relevant
  // calculate the total number of instances/examples per class, the word
  // count in each class, and for each class a word:frequency map
  int numof_ir = 0;
  int numof_r = 0;
  int numwords_ir = 0;
  int numwords_r = 0;
  HashSet<String> uniquewords = new HashSet<String>();
  HashMap<String, Integer> wordfreq_ir = new HashMap<String, Integer>();
  HashMap<String, Integer> wordfreq_r = new HashMap<String, Integer>();
  String line = "";
  String target = "";
  String[] linearray = null;

  // read the training file line by line
  Configuration configuration = new Configuration();
  FileSystem fs = FileSystem.get(configuration);
  BufferedReader bufferedReader = new BufferedReader(
      configuration.getConfResourceAsReader(filepath));
  while ((line = bufferedReader.readLine()) != null) {
    target = line.split("\t")[0];
    line = replacefirstoccuranceof(target + "\t", line);
    linearray = line.replaceAll("[^a-zA-Z ]", "").toLowerCase().split(" ");

    // update the data structures
    if (target.equals("0")) {
      numof_ir += 1;
      numwords_ir += linearray.length;
      for (int i = 0; i < linearray.length; i++) {
        uniquewords.add(linearray[i]);
        updateHashMap(wordfreq_ir, linearray[i]);
      }
    } else {
      numof_r += 1;
      numwords_r += linearray.length;
      for (int i = 0; i < linearray.length; i++) {
        uniquewords.add(linearray[i]);
        updateHashMap(wordfreq_r, linearray[i]);
      }
    }
  }

  // write the model file
  Path path = new Path("naivebayes-model");
  Writer writer = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));
  writer.write(String.valueOf(uniquewords.size()) + "\n");
  writer.write("0\n");
  writer.write(String.valueOf(numof_ir) + "\n");
  writer.write(String.valueOf(numwords_ir) + "\n");
  writer.write(flattenHashMap(wordfreq_ir) + "\n");
  writer.write("1\n");
  writer.write(String.valueOf(numof_r) + "\n");
  writer.write(String.valueOf(numwords_r) + "\n");
  writer.write(flattenHashMap(wordfreq_r) + "\n");
  writer.close();
  bufferedReader.close();
}
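From the parsing logic above, each line of the training file is a class label (0 for irrelevant, 1 for relevant), a tab, then the document text. A hypothetical two-line training file, say nb-train.txt on the classpath (both the file name and its contents are made-up illustrations), would look like:

0	buy cheap replica watches best prices
1	configuring parse filters in apache nutch

Training is then kicked off with Train.start("nb-train.txt"), which writes the per-class counts and word:frequency maps to the naivebayes-model file shown above.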
From source file: org.apache.nutch.parsefilter.regex.RegexParseFilter.java
License: Apache License

public void setConf(Configuration conf) {
  this.conf = conf;

  // get the extensions for domain urlfilter
  String pluginName = "parsefilter-regex";
  Extension[] extensions = PluginRepository.get(conf)
      .getExtensionPoint(HtmlParseFilter.class.getName()).getExtensions();
  for (int i = 0; i < extensions.length; i++) {
    Extension extension = extensions[i];
    if (extension.getDescriptor().getPluginId().equals(pluginName)) {
      attributeFile = extension.getAttribute("file");
      break;
    }
  }

  // treat a blank (but non-null) attribute value as unset
  if (attributeFile != null && attributeFile.trim().equals("")) {
    attributeFile = null;
  }
  if (attributeFile != null) {
    if (LOG.isInfoEnabled()) {
      LOG.info("Attribute \"file\" is defined for plugin " + pluginName
          + " as " + attributeFile);
    }
  } else {
    if (LOG.isWarnEnabled()) {
      LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin " + pluginName);
    }
  }

  // domain file and attribute "file" take precedence if defined
  String file = conf.get("parsefilter.regex.file");
  String stringRules = conf.get("parsefilter.regex.rules");
  if (regexFile != null) {
    file = regexFile;
  } else if (attributeFile != null) {
    file = attributeFile;
  }
  Reader reader = null;
  if (stringRules != null) { // takes precedence over files
    reader = new StringReader(stringRules);
  } else {
    reader = conf.getConfResourceAsReader(file);
  }
  try {
    if (reader == null) {
      reader = new FileReader(file);
    }
    readConfiguration(reader);
  } catch (IOException e) {
    LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
  }
}
From source file: org.apache.nutch.protocol.http.proxy.api.HttpBase.java
License: Apache License

public void setConf(Configuration conf) {
  this.conf = conf;
  this.webProtectSkip = conf.getBoolean(MyConstant.SKIP_WEB_CRAWL_PROTECT, false);
  this.proxyHost = conf.get("http.proxy.host");
  this.proxyPort = conf.getInt("http.proxy.port", 8080);
  // maximum number of requests per proxy IP
  this.proxyReqMax = conf.getInt("http.proxy.reqmax", 500);
  this.useProxy = (proxyHost != null && proxyHost.length() > 0);
  this.timeout = conf.getInt("http.timeout", 10000);
  this.maxContent = conf.getInt("http.content.limit", 64 * 1024);
  this.userAgent = getAgentString(conf.get("http.agent.name"),
      conf.get("http.agent.version"), conf.get("http.agent.description"),
      conf.get("http.agent.url"), conf.get("http.agent.email"));
  this.acceptLanguage = conf.get("http.accept.language", acceptLanguage);
  this.accept = conf.get("http.accept", accept);
  // backward-compatible default setting
  this.useHttp11 = conf.getBoolean("http.useHttp11", false);
  this.responseTime = conf.getBoolean("http.store.responsetime", true);
  this.enableIfModifiedsinceHeader = conf.getBoolean("http.enable.if.modified.since.header", true);
  this.robots.setConf(conf);

  // NUTCH-1941: read list of alternating agent names
  if (conf.getBoolean("http.agent.rotate", false)) {
    String agentsFile = conf.get("http.agent.rotate.file", "agents.txt");
    BufferedReader br = null;
    try {
      Reader reader = conf.getConfResourceAsReader(agentsFile);
      br = new BufferedReader(reader);
      userAgentNames = new ArrayList<String>();
      String word = "";
      while ((word = br.readLine()) != null) {
        if (!word.trim().isEmpty())
          userAgentNames.add(word.trim());
      }
      if (userAgentNames.size() == 0) {
        logger.warn("Empty list of user agents in http.agent.rotate.file {}", agentsFile);
        userAgentNames = null;
      }
    } catch (Exception e) {
      logger.warn("Failed to read http.agent.rotate.file {}: {}", agentsFile,
          StringUtils.stringifyException(e));
      userAgentNames = null;
    } finally {
      if (br != null) {
        try {
          br.close();
        } catch (IOException e) {
          // ignore
        }
      }
    }
    if (userAgentNames == null) {
      logger.warn("Falling back to fixed user agent set via property http.agent.name");
    }
  }

  String[] protocols = conf.getStrings("http.tls.supported.protocols",
      "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3");
  String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites",
      "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384", "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384",
      "TLS_RSA_WITH_AES_256_CBC_SHA256", "TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384",
      "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384", "TLS_DHE_RSA_WITH_AES_256_CBC_SHA256",
      "TLS_DHE_DSS_WITH_AES_256_CBC_SHA256", "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA",
      "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA", "TLS_RSA_WITH_AES_256_CBC_SHA",
      "TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA", "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA",
      "TLS_DHE_RSA_WITH_AES_256_CBC_SHA", "TLS_DHE_DSS_WITH_AES_256_CBC_SHA",
      "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256", "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256",
      "TLS_RSA_WITH_AES_128_CBC_SHA256", "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256",
      "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256", "TLS_DHE_RSA_WITH_AES_128_CBC_SHA256",
      "TLS_DHE_DSS_WITH_AES_128_CBC_SHA256", "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA",
      "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA", "TLS_RSA_WITH_AES_128_CBC_SHA",
      "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA", "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA",
      "TLS_DHE_RSA_WITH_AES_128_CBC_SHA", "TLS_DHE_DSS_WITH_AES_128_CBC_SHA",
      "TLS_ECDHE_ECDSA_WITH_RC4_128_SHA", "TLS_ECDHE_RSA_WITH_RC4_128_SHA",
      "SSL_RSA_WITH_RC4_128_SHA", "TLS_ECDH_ECDSA_WITH_RC4_128_SHA",
      "TLS_ECDH_RSA_WITH_RC4_128_SHA", "TLS_ECDHE_ECDSA_WITH_3DES_EDE_CBC_SHA",
      "TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_3DES_EDE_CBC_SHA",
      "TLS_ECDH_ECDSA_WITH_3DES_EDE_CBC_SHA", "TLS_ECDH_RSA_WITH_3DES_EDE_CBC_SHA",
      "SSL_DHE_RSA_WITH_3DES_EDE_CBC_SHA", "SSL_DHE_DSS_WITH_3DES_EDE_CBC_SHA",
      "SSL_RSA_WITH_RC4_128_MD5", "TLS_EMPTY_RENEGOTIATION_INFO_SCSV",
      "TLS_RSA_WITH_NULL_SHA256", "TLS_ECDHE_ECDSA_WITH_NULL_SHA",
      "TLS_ECDHE_RSA_WITH_NULL_SHA", "SSL_RSA_WITH_NULL_SHA",
      "TLS_ECDH_ECDSA_WITH_NULL_SHA", "TLS_ECDH_RSA_WITH_NULL_SHA",
      "SSL_RSA_WITH_NULL_MD5", "SSL_RSA_WITH_DES_CBC_SHA",
      "SSL_DHE_RSA_WITH_DES_CBC_SHA", "SSL_DHE_DSS_WITH_DES_CBC_SHA",
      "TLS_KRB5_WITH_RC4_128_SHA", "TLS_KRB5_WITH_RC4_128_MD5",
      "TLS_KRB5_WITH_3DES_EDE_CBC_SHA", "TLS_KRB5_WITH_3DES_EDE_CBC_MD5",
      "TLS_KRB5_WITH_DES_CBC_SHA", "TLS_KRB5_WITH_DES_CBC_MD5");

  tlsPreferredProtocols = new HashSet<String>(Arrays.asList(protocols));
  tlsPreferredCipherSuites = new HashSet<String>(Arrays.asList(ciphers));

  logConf();

  // load the custom proxy list from proxylist.conf, skipping blank lines and
  // comments; enable the proxy if at least one entry is present
  InputStream is = null;
  BufferedReader dr = null;
  try {
    LOGGER.info("Loading custom proxy list ...");
    is = conf.getConfResourceAsInputStream("proxylist.conf");
    if (is == null) {
      return;
    }
    dr = new BufferedReader(new InputStreamReader(is));
    String tmp = null;
    while ((tmp = dr.readLine()) != null) {
      if (!"".equals(tmp) && !tmp.startsWith("#")) {
        proxyList.add(tmp);
        LOGGER.info(tmp);
      }
    }
    LOGGER.info("Finished loading custom proxy list");
    if (!proxyList.isEmpty()) {
      this.useProxy = true;
    }
  } catch (Exception e) {
    logger.error("custom proxylist read error :", e);
  } finally {
    if (dr != null) {
      try {
        dr.close();
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
    if (is != null) {
      try {
        is.close();
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
  }
}
From source file: org.apache.nutch.protocol.selenium.RegexURLFilter.java
License: Apache License

/**
 * Rules specified as a config property will override rules specified as a
 * config file.
 */
protected Reader getRulesReader(Configuration conf) throws IOException {
  String fileRules = conf.get(URLFILTER_REGEX_FILE);
  return conf.getConfResourceAsReader(fileRules);
}
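The javadoc above promises that rules from a config property override a rules file, yet this helper only reads the file. A hedged sketch of how that precedence could be implemented, mirroring the StringReader pattern in the RegexParseFilter example earlier (the constant URLFILTER_REGEX_RULES, naming a property that holds inline rules, is an assumption):

protected Reader getRulesReader(Configuration conf) throws IOException {
  // inline rules from the config property take precedence, as the javadoc states
  String stringRules = conf.get(URLFILTER_REGEX_RULES); // assumed constant
  if (stringRules != null) {
    return new StringReader(stringRules);
  }
  String fileRules = conf.get(URLFILTER_REGEX_FILE);
  return conf.getConfResourceAsReader(fileRules);
}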
From source file: org.apache.nutch.scoring.regex.RegexAnalysisScoringFilter.java
License: Apache License

public void setConf(Configuration conf) {
  this.conf = conf;
  String fileRules = conf.get("scoring.regex.file");
  Reader reader = conf.getConfResourceAsReader(fileRules);
  try {
    regexScoreMap = readRules(reader);
  } catch (IOException e) {
    if (LOG.isErrorEnabled()) {
      LOG.error(e.getMessage());
    }
    throw new RuntimeException(e.getMessage(), e);
  }
}
From source file: org.apache.nutch.scoring.similarity.cosine.CosineSimilarityModel.java
License: Apache License

@Override
public void setConf(Configuration conf) {
  this.conf = conf;
  goldStandardDocPath = conf.get("scoring.similarity.model.path");
  Reader reader = conf.getConfResourceAsReader(goldStandardDocPath);
  try {
    String fileContent = IOUtils.toString(reader);
    if (goldStandardDocVect == null) {
      goldStandardDocVect = new DocumentVector(fileContent, conf);
      LOG.info("Creating DocVector from path - {}", goldStandardDocPath);
    }
  } catch (IOException e) {
    LOG.error("Failed to create Document vector : {}", StringUtils.stringifyException(e));
    e.printStackTrace();
  }
}
From source file: org.apache.nutch.scoring.similarity.cosine.DocumentVector.java
License: Apache License

private void populateStopWordSet(Configuration conf) {
  String stopWordFilePath = conf.get("scoring.similarity.stopword.file");
  Reader reader = conf.getConfResourceAsReader(stopWordFilePath);
  try {
    LOG.info("Populating stopwords from {}", stopWordFilePath);
    String[] stopWordList1 = IOUtils.toString(reader).split("\n");
    for (String stopWord : stopWordList1) {
      stopWord = stopWord.trim();
      stopwordSet.add(stopWord);
    }
  } catch (IOException e) {
    LOG.error("Failed to populate stopwords : {}", StringUtils.stringifyException(e));
    e.printStackTrace();
  }
}
From source file: org.apache.nutch.scoring.similarity.cosine.Model.java
License: Apache License

public static synchronized void createModel(Configuration conf) throws IOException {
  if (isModelCreated) {
    LOG.info("Model exists, skipping model creation");
    return;
  }
  LOG.info("Creating Cosine model");
  try {
    // if the user has specified a stopword file other than the template
    if (!conf.get("scoring.similarity.stopword.file").equals("stopwords.txt.template")) {
      stopWords = new ArrayList<String>();
      String stopWord;
      BufferedReader br = new BufferedReader(
          conf.getConfResourceAsReader(conf.get("scoring.similarity.stopword.file")));
      while ((stopWord = br.readLine()) != null) {
        stopWords.add(stopWord);
      }
      LOG.info("Loaded custom stopwords from {}", conf.get("scoring.similarity.stopword.file"));
    }

    int[] ngramArr = retrieveNgrams(conf);
    int mingram = ngramArr[0];
    int maxgram = ngramArr[1];
    LOG.info("Value of mingram: {} maxgram: {}", mingram, maxgram);

    // TODO: Allow for a corpus of documents to be provided as the gold standard.
    String line;
    StringBuilder sb = new StringBuilder();
    BufferedReader br = new BufferedReader(
        conf.getConfResourceAsReader(conf.get("cosine.goldstandard.file")));
    while ((line = br.readLine()) != null) {
      sb.append(line);
    }
    DocVector goldStandard = createDocVector(sb.toString(), mingram, maxgram);
    if (goldStandard != null)
      docVectors.add(goldStandard);
    else {
      throw new Exception("Could not create DocVector for goldstandard");
    }
  } catch (Exception e) {
    LOG.warn("Failed to add {} to model : {}",
        conf.get("cosine.goldstandard.file", "goldstandard.txt.template"),
        StringUtils.stringifyException(e));
  }
  if (docVectors.size() > 0) {
    LOG.info("Cosine model creation complete");
    isModelCreated = true;
  } else {
    LOG.info("Cosine model creation failed");
  }
}
From source file: org.apache.nutch.scoring.similarity.DocumentVector.java
License: Apache License

private void removeStopWords(Configuration conf) {
  String stopWordFilePath = conf.get("scoring.similarity.stopword.file");
  Reader reader = conf.getConfResourceAsReader(stopWordFilePath);
  try {
    String[] stopWordList1 = IOUtils.toString(reader).split("\n");
    for (String stopWord : stopWordList1) {
      stopWord = stopWord.trim();
      if (termFreqVect.containsKey(stopWord)) {
        termFreqVect.remove(stopWord);
      }
    }
  } catch (IOException e) {
    // if the stopword file cannot be read, the term vector is left unfiltered
    e.printStackTrace();
  }
}