Example usage for org.apache.hadoop.conf Configuration getConfResourceAsReader

List of usage examples for org.apache.hadoop.conf Configuration getConfResourceAsReader

Introduction

On this page you can find example usages of org.apache.hadoop.conf Configuration getConfResourceAsReader.

Prototype

public Reader getConfResourceAsReader(String name) 

Source Link

Document

Get a Reader attached to the configuration resource with the given name.

Usage

From source file:org.apache.nutch.parsefilter.naivebayes.NaiveBayesParseFilter.java

License:Apache License

/**
 * Stores the configuration, loads the word list and trains the model.
 *
 * <p>The training file and dictionary file names are read from the
 * properties named by {@code TRAINFILE_MODELFILTER} and
 * {@code DICTFILE_MODELFILTER}. Every line of the dictionary resource is
 * added to {@code wordlist}, after which {@code train()} is invoked.
 * I/O errors while reading the dictionary and any exception raised by
 * training are logged rather than propagated.
 *
 * @param conf the configuration to read the settings and resources from
 * @throws IllegalArgumentException if either property is unset or blank, if
 *         the file-system check below trips, or if the dictionary cannot be
 *         opened as a configuration resource
 */
public void setConf(Configuration conf) {
    this.conf = conf;
    inputFilePath = conf.get(TRAINFILE_MODELFILTER);
    dictionaryFile = conf.get(DICTFILE_MODELFILTER);
    if (inputFilePath == null || inputFilePath.trim().length() == 0 || dictionaryFile == null
            || dictionaryFile.trim().length() == 0) {
        // Name the properties via the constants that were actually read so the
        // message cannot drift from the real keys.
        String message = "ParseFilter: NaiveBayes: trainfile or wordlist not set in the "
                + TRAINFILE_MODELFILTER + " or " + DICTFILE_MODELFILTER;
        if (LOG.isErrorEnabled()) {
            LOG.error(message);
        }
        throw new IllegalArgumentException(message);
    }
    try {
        // NOTE(review): this condition fires when one of the paths EXISTS on the
        // file system, yet the message says "not found". It looks inverted, but
        // the files are subsequently loaded as configuration resources (not via
        // the FileSystem), so negating it could reject working setups. Left
        // unchanged on purpose - confirm the intended check.
        if ((FileSystem.get(conf).exists(new Path(inputFilePath)))
                || (FileSystem.get(conf).exists(new Path(dictionaryFile)))) {
            String message = "ParseFilter: NaiveBayes: " + inputFilePath + " or " + dictionaryFile
                    + " not found!";
            if (LOG.isErrorEnabled()) {
                LOG.error(message);
            }
            throw new IllegalArgumentException(message);
        }

        Reader reader = conf.getConfResourceAsReader(dictionaryFile);
        if (reader == null) {
            // Fail with a descriptive error instead of the NullPointerException
            // that wrapping a null reader would raise.
            String message = "ParseFilter: NaiveBayes: " + dictionaryFile
                    + " could not be opened as a configuration resource";
            if (LOG.isErrorEnabled()) {
                LOG.error(message);
            }
            throw new IllegalArgumentException(message);
        }

        // try-with-resources closes the reader even when readLine() fails
        try (BufferedReader br = new BufferedReader(reader)) {
            String currentLine;
            while ((currentLine = br.readLine()) != null) {
                wordlist.add(currentLine);
            }
        }
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
    }
    try {
        train();
    } catch (Exception e) {
        LOG.error("Error occured while training:: " + StringUtils.stringifyException(e));
    }
}

From source file:org.apache.nutch.parsefilter.naivebayes.Train.java

License:Apache License

/**
 * Builds the naive Bayes model from a tab-separated training resource and
 * writes it to the path {@code naivebayes-model}.
 *
 * <p>Each input line is split at the first tab: the part before it is the
 * class label ({@code "0"} is counted as irrelevant, anything else as
 * relevant) and the remainder is reduced to lower-case ASCII letters and
 * split on single spaces into words. Per class, the number of examples, the
 * number of words and a word-frequency map are collected and written out.
 *
 * @param filepath name of the training file, resolved as a configuration
 *        resource
 * @throws IOException if the training resource cannot be opened or read, or
 *         the model file cannot be written
 */
public static void start(String filepath) throws IOException {

    // two classes 0/irrelevant and 1/relevant

    // calculate the total number of instances/examples per class, word count in
    // each class and for each class a word:frequency map

    int numof_ir = 0;
    int numof_r = 0;
    int numwords_ir = 0;
    int numwords_r = 0;
    HashSet<String> uniquewords = new HashSet<String>();
    HashMap<String, Integer> wordfreq_ir = new HashMap<String, Integer>();
    HashMap<String, Integer> wordfreq_r = new HashMap<String, Integer>();

    Configuration configuration = new Configuration();
    FileSystem fs = FileSystem.get(configuration);

    java.io.Reader trainingResource = configuration.getConfResourceAsReader(filepath);
    if (trainingResource == null) {
        // Report a missing resource explicitly instead of failing with a
        // NullPointerException inside the BufferedReader constructor.
        throw new IOException("Training file not found as a configuration resource: " + filepath);
    }

    // read the training data; the reader is closed even if parsing fails
    try (BufferedReader bufferedReader = new BufferedReader(trainingResource)) {
        String line;
        while ((line = bufferedReader.readLine()) != null) {

            String target = line.split("\t")[0];

            line = replacefirstoccuranceof(target + "\t", line);

            String[] linearray = line.replaceAll("[^a-zA-Z ]", "").toLowerCase().split(" ");

            // update the data structures
            if (target.equals("0")) {
                numof_ir += 1;
                numwords_ir += linearray.length;
                for (int i = 0; i < linearray.length; i++) {
                    uniquewords.add(linearray[i]);
                    updateHashMap(wordfreq_ir, linearray[i]);
                }
            } else {
                numof_r += 1;
                numwords_r += linearray.length;
                for (int i = 0; i < linearray.length; i++) {
                    uniquewords.add(linearray[i]);
                    updateHashMap(wordfreq_r, linearray[i]);
                }
            }
        }
    }

    // write the model file; the writer is flushed and closed even on failure
    Path path = new Path("naivebayes-model");

    try (Writer writer = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)))) {
        writer.write(String.valueOf(uniquewords.size()) + "\n");
        writer.write("0\n");
        writer.write(String.valueOf(numof_ir) + "\n");
        writer.write(String.valueOf(numwords_ir) + "\n");
        writer.write(flattenHashMap(wordfreq_ir) + "\n");
        writer.write("1\n");
        writer.write(String.valueOf(numof_r) + "\n");
        writer.write(String.valueOf(numwords_r) + "\n");
        writer.write(flattenHashMap(wordfreq_r) + "\n");
    }
}

From source file:org.apache.nutch.parsefilter.regex.RegexParseFilter.java

License:Apache License

/**
 * Stores the configuration and loads the regex rules.
 *
 * <p>Rule source, in order of precedence: the inline rules in property
 * {@code parsefilter.regex.rules}; otherwise a file whose name is taken from
 * {@code regexFile}, then the plugin's {@code file} attribute, then property
 * {@code parsefilter.regex.file}. The file is first looked up as a
 * configuration resource and, failing that, opened as a plain file.
 * I/O errors are logged, not propagated.
 *
 * @param conf the configuration to read settings and resources from
 */
public void setConf(Configuration conf) {
    this.conf = conf;

    // look up the "file" attribute declared for this plugin's extension
    String pluginName = "parsefilter-regex";
    Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(HtmlParseFilter.class.getName())
            .getExtensions();
    for (int i = 0; i < extensions.length; i++) {
        Extension extension = extensions[i];
        if (extension.getDescriptor().getPluginId().equals(pluginName)) {
            attributeFile = extension.getAttribute("file");
            break;
        }
    }

    // handle blank non empty input
    if (attributeFile != null && attributeFile.trim().equals("")) {
        attributeFile = null;
    }

    if (attributeFile != null) {
        if (LOG.isInfoEnabled()) {
            LOG.info("Attribute \"file\" is defined for plugin " + pluginName + " as " + attributeFile);
        }
    } else {
        if (LOG.isWarnEnabled()) {
            LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin " + pluginName);
        }
    }

    // regexFile and attribute "file" take precedence over the property if defined
    String file = conf.get("parsefilter.regex.file");
    String stringRules = conf.get("parsefilter.regex.rules");
    if (regexFile != null) {
        file = regexFile;
    } else if (attributeFile != null) {
        file = attributeFile;
    }

    if (stringRules == null && file == null) {
        // Nothing to load: without this guard the code below would end in a
        // NullPointerException from new FileReader(null).
        LOG.error("Neither parsefilter.regex.rules nor a rules file is configured for plugin " + pluginName);
        return;
    }

    try {
        Reader reader;
        if (stringRules != null) { // takes precedence over files
            reader = new StringReader(stringRules);
        } else {
            reader = conf.getConfResourceAsReader(file);
            if (reader == null) {
                // not available as a configuration resource: try a plain file
                reader = new FileReader(file);
            }
        }
        // close the reader once the rules are parsed, even if parsing fails
        try (Reader rules = reader) {
            readConfiguration(rules);
        }
    } catch (IOException e) {
        LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
    }
}

From source file:org.apache.nutch.protocol.http.proxy.api.HttpBase.java

License:Apache License

/**
 * Stores the configuration and initialises this protocol's settings from it.
 *
 * <p>Reads proxy, timeout, content-limit, user-agent and header settings,
 * optionally loads a list of rotating user agent names (property
 * {@code http.agent.rotate}), sets the preferred TLS protocols and cipher
 * suites, and finally reads proxy entries from the configuration resource
 * {@code proxylist.conf}. If that list is non-empty, proxy usage is enabled.
 * Failures while reading either file are logged and do not propagate.
 *
 * @param conf the configuration to read settings and resources from
 */
public void setConf(Configuration conf) {
    this.conf = conf;
    this.webProtectSkip = conf.getBoolean(MyConstant.SKIP_WEB_CRAWL_PROTECT, false);
    this.proxyHost = conf.get("http.proxy.host");
    this.proxyPort = conf.getInt("http.proxy.port", 8080);
    // NOTE(review): the original comment here was garbled; presumably this is
    // the maximum number of requests per proxy - confirm.
    this.proxyReqMax = conf.getInt("http.proxy.reqmax", 500);

    this.useProxy = (proxyHost != null && proxyHost.length() > 0);
    this.timeout = conf.getInt("http.timeout", 10000);
    this.maxContent = conf.getInt("http.content.limit", 64 * 1024);
    this.userAgent = getAgentString(conf.get("http.agent.name"), conf.get("http.agent.version"),
            conf.get("http.agent.description"), conf.get("http.agent.url"), conf.get("http.agent.email"));
    this.acceptLanguage = conf.get("http.accept.language", acceptLanguage);
    this.accept = conf.get("http.accept", accept);
    // backward-compatible default setting
    this.useHttp11 = conf.getBoolean("http.useHttp11", false);
    this.responseTime = conf.getBoolean("http.store.responsetime", true);
    this.enableIfModifiedsinceHeader = conf.getBoolean("http.enable.if.modified.since.header", true);
    this.robots.setConf(conf);

    // NUTCH-1941: read list of alternating agent names
    if (conf.getBoolean("http.agent.rotate", false)) {
        String agentsFile = conf.get("http.agent.rotate.file", "agents.txt");
        BufferedReader br = null;
        try {
            Reader reader = conf.getConfResourceAsReader(agentsFile);
            br = new BufferedReader(reader);
            userAgentNames = new ArrayList<String>();
            String word = "";
            while ((word = br.readLine()) != null) {
                if (!word.trim().isEmpty()) {
                    userAgentNames.add(word.trim());
                }
            }

            if (userAgentNames.size() == 0) {
                logger.warn("Empty list of user agents in http.agent.rotate.file {}", agentsFile);
                userAgentNames = null;
            }

        } catch (Exception e) {
            logger.warn("Failed to read http.agent.rotate.file {}: {}", agentsFile,
                    StringUtils.stringifyException(e));
            userAgentNames = null;
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    // ignore: a close failure must not discard the names already read
                }
            }
        }
        if (userAgentNames == null) {
            logger.warn("Falling back to fixed user agent set via property http.agent.name");
        }
    }

    String[] protocols = conf.getStrings("http.tls.supported.protocols", "TLSv1.2", "TLSv1.1", "TLSv1",
            "SSLv3");
    String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites",
            "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384", "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384",
            "TLS_RSA_WITH_AES_256_CBC_SHA256", "TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384",
            "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384", "TLS_DHE_RSA_WITH_AES_256_CBC_SHA256",
            "TLS_DHE_DSS_WITH_AES_256_CBC_SHA256", "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA",
            "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA", "TLS_RSA_WITH_AES_256_CBC_SHA",
            "TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA", "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA",
            "TLS_DHE_RSA_WITH_AES_256_CBC_SHA", "TLS_DHE_DSS_WITH_AES_256_CBC_SHA",
            "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256", "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256",
            "TLS_RSA_WITH_AES_128_CBC_SHA256", "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256",
            "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256", "TLS_DHE_RSA_WITH_AES_128_CBC_SHA256",
            "TLS_DHE_DSS_WITH_AES_128_CBC_SHA256", "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA",
            "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA", "TLS_RSA_WITH_AES_128_CBC_SHA",
            "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA", "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA",
            "TLS_DHE_RSA_WITH_AES_128_CBC_SHA", "TLS_DHE_DSS_WITH_AES_128_CBC_SHA",
            "TLS_ECDHE_ECDSA_WITH_RC4_128_SHA", "TLS_ECDHE_RSA_WITH_RC4_128_SHA", "SSL_RSA_WITH_RC4_128_SHA",
            "TLS_ECDH_ECDSA_WITH_RC4_128_SHA", "TLS_ECDH_RSA_WITH_RC4_128_SHA",
            "TLS_ECDHE_ECDSA_WITH_3DES_EDE_CBC_SHA", "TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA",
            "SSL_RSA_WITH_3DES_EDE_CBC_SHA", "TLS_ECDH_ECDSA_WITH_3DES_EDE_CBC_SHA",
            "TLS_ECDH_RSA_WITH_3DES_EDE_CBC_SHA", "SSL_DHE_RSA_WITH_3DES_EDE_CBC_SHA",
            "SSL_DHE_DSS_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_RC4_128_MD5",
            "TLS_EMPTY_RENEGOTIATION_INFO_SCSV", "TLS_RSA_WITH_NULL_SHA256", "TLS_ECDHE_ECDSA_WITH_NULL_SHA",
            "TLS_ECDHE_RSA_WITH_NULL_SHA", "SSL_RSA_WITH_NULL_SHA", "TLS_ECDH_ECDSA_WITH_NULL_SHA",
            "TLS_ECDH_RSA_WITH_NULL_SHA", "SSL_RSA_WITH_NULL_MD5", "SSL_RSA_WITH_DES_CBC_SHA",
            "SSL_DHE_RSA_WITH_DES_CBC_SHA", "SSL_DHE_DSS_WITH_DES_CBC_SHA", "TLS_KRB5_WITH_RC4_128_SHA",
            "TLS_KRB5_WITH_RC4_128_MD5", "TLS_KRB5_WITH_3DES_EDE_CBC_SHA", "TLS_KRB5_WITH_3DES_EDE_CBC_MD5",
            "TLS_KRB5_WITH_DES_CBC_SHA", "TLS_KRB5_WITH_DES_CBC_MD5");

    tlsPreferredProtocols = new HashSet<String>(Arrays.asList(protocols));
    tlsPreferredCipherSuites = new HashSet<String>(Arrays.asList(ciphers));

    logConf();

    // Load the custom proxy list. Blank lines and lines starting with '#' are
    // skipped. try-with-resources closes both streams; a failure on close is
    // reported through the logger by the catch block below.
    LOGGER.info("??.........");
    try (InputStream is = conf.getConfResourceAsInputStream("proxylist.conf")) {
        if (is != null) { // resource is optional: nothing to do when it is absent
            try (BufferedReader dr = new BufferedReader(new InputStreamReader(is))) {
                String tmp = null;
                while ((tmp = dr.readLine()) != null) {
                    if (!"".equals(tmp) && !tmp.startsWith("#")) {
                        proxyList.add(tmp);
                        LOGGER.info(tmp);
                    }
                }
                LOGGER.info("????");
                if (!proxyList.isEmpty()) {
                    this.useProxy = true;
                }
            }
        }
    } catch (Exception e) {
        logger.error("custom proxylist read error :", e);
    }
}

From source file:org.apache.nutch.protocol.selenium.RegexURLFilter.java

License:Apache License

/**
 * Rules specified as a config property will override rules specified as a
 * config file./*from  w w w.  j ava  2 s  .c  om*/
 */
protected Reader getRulesReader(Configuration conf) throws IOException {
    String fileRules = conf.get(URLFILTER_REGEX_FILE);
    return conf.getConfResourceAsReader(fileRules);
}

From source file:org.apache.nutch.scoring.regex.RegexAnalysisScoringFilter.java

License:Apache License

/**
 * Stores the configuration and loads the regex scoring rules.
 *
 * <p>The rules are read from the configuration resource named by property
 * {@code scoring.regex.file} and stored in {@code regexScoreMap}.
 *
 * @param conf the configuration providing the property and the resource
 * @throws RuntimeException wrapping the {@link IOException} if the rules
 *         cannot be read
 */
public void setConf(Configuration conf) {
    this.conf = conf;
    String fileRules = conf.get("scoring.regex.file");
    // try-with-resources closes the reader after parsing; a null reader is
    // passed through to readRules unchanged and simply not closed
    try (Reader reader = conf.getConfResourceAsReader(fileRules)) {
        regexScoreMap = readRules(reader);
    } catch (IOException e) {
        if (LOG.isErrorEnabled()) {
            LOG.error(e.getMessage());
        }
        throw new RuntimeException(e.getMessage(), e);
    }
}

From source file:org.apache.nutch.scoring.similarity.cosine.CosineSimilarityModel.java

License:Apache License

/**
 * Stores the configuration and, if no gold-standard document vector exists
 * yet, builds one from the configuration resource named by property
 * {@code scoring.similarity.model.path}.
 *
 * <p>I/O failures are logged and leave the vector unset.
 *
 * @param conf the configuration providing the property and the resource
 */
@Override
public void setConf(Configuration conf) {
    this.conf = conf;
    goldStandardDocPath = conf.get("scoring.similarity.model.path");
    if (goldStandardDocVect != null) {
        // vector already built: skip re-reading the file
        return;
    }
    // try-with-resources closes the reader once its content has been consumed
    try (Reader reader = conf.getConfResourceAsReader(goldStandardDocPath)) {
        String fileContent = IOUtils.toString(reader);
        goldStandardDocVect = new DocumentVector(fileContent, conf);
        LOG.info("Creating DocVector from path - {}", goldStandardDocPath);
    } catch (IOException e) {
        // the stringified exception already carries the stack trace
        LOG.error("Failed to create Document vector : {}", StringUtils.stringifyException(e));
    }
}

From source file:org.apache.nutch.scoring.similarity.cosine.DocumentVector.java

License:Apache License

/**
 * Fills {@code stopwordSet} from the configuration resource named by
 * property {@code scoring.similarity.stopword.file}.
 *
 * <p>The resource is split on newlines and every entry is trimmed before
 * being added. I/O failures are logged and not propagated.
 *
 * @param conf the configuration providing the property and the resource
 */
private void populateStopWordSet(Configuration conf) {
    String stopWordFilePath = conf.get("scoring.similarity.stopword.file");
    // try-with-resources closes the reader once its content has been consumed
    try (Reader reader = conf.getConfResourceAsReader(stopWordFilePath)) {
        LOG.info("Populating stopwords from {}", stopWordFilePath);
        for (String stopWord : IOUtils.toString(reader).split("\n")) {
            stopwordSet.add(stopWord.trim());
        }
    } catch (IOException e) {
        // the stringified exception already carries the stack trace
        LOG.error("Failed to populate stopwords : {}", StringUtils.stringifyException(e));
    }
}

From source file:org.apache.nutch.scoring.similarity.cosine.Model.java

License:Apache License

/**
 * Creates the cosine similarity model once; later calls return immediately.
 *
 * <p>If property {@code scoring.similarity.stopword.file} names something
 * other than {@code stopwords.txt.template}, that resource is loaded into
 * {@code stopWords}, one entry per line. The gold-standard document named by
 * property {@code cosine.goldstandard.file} is then read, converted to a
 * document vector and added to {@code docVectors}. Any failure is logged as
 * a warning; the model counts as created only if at least one document
 * vector is present afterwards.
 *
 * @param conf the configuration providing the properties and resources
 * @throws IOException declared by the signature; failures inside the body
 *         are caught and logged
 */
public static synchronized void createModel(Configuration conf) throws IOException {
    if (isModelCreated) {
        LOG.info("Model exists, skipping model creation");
        return;
    }
    LOG.info("Creating Cosine model");
    try {
        // If user has specified a stopword file other than the template.
        // An unset property is treated like the template so that it cannot
        // abort the gold-standard loading below with a NullPointerException.
        String stopWordFile = conf.get("scoring.similarity.stopword.file");
        if (stopWordFile != null && !stopWordFile.equals("stopwords.txt.template")) {
            stopWords = new ArrayList<String>();
            try (BufferedReader br = new BufferedReader(conf.getConfResourceAsReader(stopWordFile))) {
                String stopWord;
                while ((stopWord = br.readLine()) != null) {
                    stopWords.add(stopWord);
                }
            }
            LOG.info("Loaded custom stopwords from {}", stopWordFile);
        }

        int[] ngramArr = retrieveNgrams(conf);
        int mingram = ngramArr[0];
        int maxgram = ngramArr[1];
        LOG.info("Value of mingram: {} maxgram: {}", mingram, maxgram);

        // TODO : Allow for corpus of documents to be provided as gold standard.
        StringBuilder sb = new StringBuilder();
        try (BufferedReader br = new BufferedReader(
                conf.getConfResourceAsReader((conf.get("cosine.goldstandard.file"))))) {
            String line;
            while ((line = br.readLine()) != null) {
                // readLine() strips the terminator; restore it so the last word
                // of one line is not glued to the first word of the next
                sb.append(line).append('\n');
            }
        }
        DocVector goldStandard = createDocVector(sb.toString(), mingram, maxgram);
        if (goldStandard == null) {
            throw new IllegalStateException("Could not create DocVector for goldstandard");
        }
        docVectors.add(goldStandard);
    } catch (Exception e) {
        LOG.warn("Failed to add {} to model : {}",
                conf.get("cosine.goldstandard.file", "goldstandard.txt.template"),
                StringUtils.stringifyException(e));
    }
    if (docVectors.size() > 0) {
        LOG.info("Cosine model creation complete");
        isModelCreated = true;
    } else {
        LOG.info("Cosine model creation failed");
    }
}

From source file:org.apache.nutch.scoring.similarity.DocumentVector.java

License:Apache License

/**
 * Removes every stop word from {@code termFreqVect}.
 *
 * <p>Stop words are read from the configuration resource named by property
 * {@code scoring.similarity.stopword.file}, split on newlines and trimmed.
 * An I/O failure leaves the term vector as it is.
 *
 * @param conf the configuration providing the property and the resource
 */
private void removeStopWords(Configuration conf) {
    String stopWordFilePath = conf.get("scoring.similarity.stopword.file");
    // try-with-resources closes the reader once its content has been consumed
    try (Reader reader = conf.getConfResourceAsReader(stopWordFilePath)) {
        for (String stopWord : IOUtils.toString(reader).split("\n")) {
            // remove() is a no-op for absent keys, so no containsKey() check is needed
            termFreqVect.remove(stopWord.trim());
        }
    } catch (IOException e) {
        // NOTE(review): no logger is visible for this class in this snippet;
        // route this through the class logger if one exists.
        e.printStackTrace();
    }
}