List of usage examples for org.apache.hadoop.conf Configuration getFloat
public float getFloat(String name, float defaultValue)

Gets the value of the name property as a float. If the property is not set, defaultValue is returned.

Parameters:
  name - the property name
Returns:
  the value of the property as a float
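Before the collected usage examples, here is a minimal, self-contained sketch of the method's behavior. It is not taken from the listing below, and the property name example.ratio is hypothetical, chosen only for illustration:

import org.apache.hadoop.conf.Configuration;

public class GetFloatDemo {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // The property is not set yet, so the default (0.75f) comes back.
    float unset = conf.getFloat("example.ratio", 0.75f); // 0.75
    // Once the property is set, getFloat parses and returns the stored value.
    conf.setFloat("example.ratio", 0.25f);
    float set = conf.getFloat("example.ratio", 0.75f); // 0.25
    System.out.println(unset + " " + set);
  }
}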
From source file:org.apache.nutch.analysis.lang.LanguageQueryFilter.java
License:Apache License
public void setConf(Configuration conf) {
  this.conf = conf;
  setBoost(conf.getFloat("query.lang.boost", 0.0f));
}
From source file:org.apache.nutch.crawl.MimeAdaptiveFetchSchedule.java
License:Apache License
public void setConf(Configuration conf) {
  super.setConf(conf);
  if (conf == null)
    return;

  // Read and set the default INC and DEC rates in case we cannot set
  // values based on MIME-type
  defaultIncRate = conf.getFloat(SCHEDULE_INC_RATE, 0.2f);
  defaultDecRate = conf.getFloat(SCHEDULE_DEC_RATE, 0.2f);

  // Where's the mime/factor file?
  Reader mimeFile = conf.getConfResourceAsReader(conf.get(SCHEDULE_MIME_FILE, "adaptive-mimetypes.txt"));

  try {
    readMimeFile(mimeFile);
  } catch (IOException e) {
    LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
  }
}
From source file:org.apache.nutch.fetcher.FetchItemQueues.java
License:Apache License
public FetchItemQueues(Configuration conf) {
  this.conf = conf;
  this.maxThreads = conf.getInt("fetcher.threads.per.queue", 1);
  queueMode = conf.get("fetcher.queue.mode", QUEUE_MODE_HOST);
  queueMode = checkQueueMode(queueMode);
  LOG.info("Using queue mode : " + queueMode);
  this.crawlDelay = (long) (conf.getFloat("fetcher.server.delay", 1.0f) * 1000);
  this.minCrawlDelay = (long) (conf.getFloat("fetcher.server.min.delay", 0.0f) * 1000);
  this.timelimit = conf.getLong("fetcher.timelimit", -1);
  this.maxExceptionsPerQueue = conf.getInt("fetcher.max.exceptions.per.queue", -1);
}
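The constructor above shows a recurring pattern with getFloat: delays are configured as fractional seconds and converted to long milliseconds before use. A minimal sketch of the same pattern, assuming a hypothetical property name (example.delay.seconds is not a real Nutch property):

import org.apache.hadoop.conf.Configuration;

public class DelayDemo {
  public static void main(String[] args) throws InterruptedException {
    Configuration conf = new Configuration();
    conf.setFloat("example.delay.seconds", 1.5f); // hypothetical property
    // float seconds -> long milliseconds, as in FetchItemQueues above
    long delayMs = (long) (conf.getFloat("example.delay.seconds", 1.0f) * 1000);
    Thread.sleep(delayMs); // sleeps ~1500 ms
  }
}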
From source file:org.apache.nutch.microformats.reltag.RelTagQueryFilter.java
License:Apache License
public void setConf(Configuration conf) {
  this.conf = conf;
  setBoost(conf.getFloat("query.tag.boost", 1.0f));
}
From source file:org.apache.nutch.scoring.link.LinkAnalysisScoringFilter.java
License:Apache License
public void setConf(Configuration conf) {
  this.conf = conf;
  normalizedScore = conf.getFloat("link.analyze.normalize.score", 1.00f);
}
From source file:org.apache.nutch.scoring.opic.OPICScoringFilter.java
License:Apache License
public void setConf(Configuration conf) {
  this.conf = conf;
  scorePower = conf.getFloat("indexer.score.power", 0.5f);
  internalScoreFactor = conf.getFloat("db.score.link.internal", 1.0f);
  externalScoreFactor = conf.getFloat("db.score.link.external", 1.0f);
  countFiltered = conf.getBoolean("db.score.count.filtered", false);
}
From source file:org.apache.nutch.scoring.opic.TestOPICScoringFilter.java
License:Apache License
@Before
public void setUp() throws Exception {
  Configuration conf = NutchConfiguration.create();
  // A LinkedHashMap (dbWebPages) is used instead of a persistent
  // data store for this test class.
  Map<String, Map<WebPage, List<ScoreDatum>>> dbWebPages = new LinkedHashMap<String, Map<WebPage, List<ScoreDatum>>>();
  // All WebPages are stored in this map with an initial value of true;
  // after processing, the value is set to false.
  Map<String, Boolean> dbWebPagesControl = new LinkedHashMap<String, Boolean>();

  TestOPICScoringFilter self = new TestOPICScoringFilter();
  self.fillLinks();

  float scoreInjected = conf.getFloat("db.score.injected", 1.0f);

  scoringFilter = new OPICScoringFilter();
  scoringFilter.setConf(conf);

  // Inject the seed list, with scores attached to the webpages.
  for (String url : self.seedList) {
    WebPage row = WebPage.newBuilder().build();
    row.setScore(scoreInjected);
    scoringFilter.injectedScore(url, row);

    List<ScoreDatum> scList = new LinkedList<ScoreDatum>();
    Map<WebPage, List<ScoreDatum>> webPageMap = new HashMap<WebPage, List<ScoreDatum>>();
    webPageMap.put(row, scList);
    dbWebPages.put(TableUtil.reverseUrl(url), webPageMap);
    dbWebPagesControl.put(TableUtil.reverseUrl(url), true);
  }

  // Depth loop
  for (int i = 1; i <= DEPTH; i++) {
    Iterator<Map.Entry<String, Map<WebPage, List<ScoreDatum>>>> iter = dbWebPages.entrySet().iterator();

    // The OPIC score is calculated for each website, one by one.
    while (iter.hasNext()) {
      Map.Entry<String, Map<WebPage, List<ScoreDatum>>> entry = iter.next();
      Map<WebPage, List<ScoreDatum>> webPageMap = entry.getValue();

      WebPage row = null;
      List<ScoreDatum> scoreList = null;
      Iterator<Map.Entry<WebPage, List<ScoreDatum>>> iters = webPageMap.entrySet().iterator();
      if (iters.hasNext()) {
        Map.Entry<WebPage, List<ScoreDatum>> values = iters.next();
        row = values.getKey();
        scoreList = values.getValue();
      }

      String reverseUrl = entry.getKey();
      String url = TableUtil.unreverseUrl(reverseUrl);
      float score = row.getScore();

      if (dbWebPagesControl.get(TableUtil.reverseUrl(url))) {
        row.setScore(scoringFilter.generatorSortValue(url, row, score));
        dbWebPagesControl.put(TableUtil.reverseUrl(url), false);
      }

      // Get outlinks from the test data.
      String[] seedOutlinks = self.linkList.get(url);
      for (String seedOutlink : seedOutlinks) {
        row.getOutlinks().put(seedOutlink, "");
      }
      self.outlinkedScoreData.clear();

      // Existing outlinks are added to outlinkedScoreData.
      Map<CharSequence, CharSequence> outlinks = row.getOutlinks();
      if (outlinks != null) {
        for (Entry<CharSequence, CharSequence> e : outlinks.entrySet()) {
          int depth = Integer.MAX_VALUE;
          self.outlinkedScoreData
              .add(new ScoreDatum(0.0f, e.getKey().toString(), e.getValue().toString(), depth));
        }
      }
      scoringFilter.distributeScoreToOutlinks(url, row, self.outlinkedScoreData,
          (outlinks == null ? 0 : outlinks.size()));

      // DbUpdate reducer simulation
      for (ScoreDatum sc : self.outlinkedScoreData) {
        if (dbWebPages.get(TableUtil.reverseUrl(sc.getUrl())) == null) {
          // Check each outlink and create a new webpage if it does not
          // yet exist in the database (dbWebPages).
          WebPage outlinkRow = WebPage.newBuilder().build();
          scoringFilter.initialScore(sc.getUrl(), outlinkRow);
          List<ScoreDatum> newScoreList = new LinkedList<ScoreDatum>();
          newScoreList.add(sc);
          Map<WebPage, List<ScoreDatum>> values = new HashMap<WebPage, List<ScoreDatum>>();
          values.put(outlinkRow, newScoreList);
          dbWebPages.put(TableUtil.reverseUrl(sc.getUrl()), values);
          dbWebPagesControl.put(TableUtil.reverseUrl(sc.getUrl()), true);
        } else {
          // Outlinks are added to the list for each webpage.
          Map<WebPage, List<ScoreDatum>> values = dbWebPages.get(TableUtil.reverseUrl(sc.getUrl()));
          Iterator<Map.Entry<WebPage, List<ScoreDatum>>> value = values.entrySet().iterator();
          if (value.hasNext()) {
            Map.Entry<WebPage, List<ScoreDatum>> list = value.next();
            scoreList = list.getValue();
            scoreList.add(sc);
          }
        }
      }
    }

    // Simulate reducing
    for (Map.Entry<String, Map<WebPage, List<ScoreDatum>>> page : dbWebPages.entrySet()) {
      String reversedUrl = page.getKey();
      String url = TableUtil.unreverseUrl(reversedUrl);

      Iterator<Map.Entry<WebPage, List<ScoreDatum>>> rr = page.getValue().entrySet().iterator();
      List<ScoreDatum> inlinkedScoreDataList = null;
      WebPage row = null;
      if (rr.hasNext()) {
        Map.Entry<WebPage, List<ScoreDatum>> aa = rr.next();
        inlinkedScoreDataList = aa.getValue();
        row = aa.getKey();
      }

      // Scores are updated here.
      scoringFilter.updateScore(url, row, inlinkedScoreDataList);
      inlinkedScoreDataList.clear();

      HashMap<String, Float> result = new HashMap<String, Float>();
      result.put(url, row.getScore());
      resultScores.put(i, result);
    }
  }
}
From source file:org.apache.nutch.scoring.pagerank.PageRankScoringFilter.java
License:Apache License
public void setConf(Configuration conf) {
  this.conf = conf;
  scorePower = conf.getFloat("indexer.score.power", 0.5f);
  internalScoreFactor = conf.getFloat("db.score.link.internal", 1.0f);
  externalScoreFactor = conf.getFloat("db.score.link.external", 1.0f);
  keywordScoreFactor = conf.getFloat("db.score.science.keywords", 1.0f);
  countFiltered = conf.getBoolean("db.score.count.filtered", false);
  try {
    extractor = new LocationExtractor("IndexingDir");
  } catch (Exception e) {
    extractor = null;
  }
}
From source file:org.apache.nutch.searcher.basic.BasicQueryFilter.java
License:Apache License
public void setConf(Configuration conf) {
  this.conf = conf;
  this.FIELD_BOOSTS[URL_BOOST] = conf.getFloat("query.url.boost", 4.0f);
  this.FIELD_BOOSTS[ANCHOR_BOOST] = conf.getFloat("query.anchor.boost", 2.0f);
  this.FIELD_BOOSTS[CONTENT_BOOST] = conf.getFloat("query.content.boost", 1.0f);
  this.FIELD_BOOSTS[TITLE_BOOST] = conf.getFloat("query.title.boost", 1.5f);
  this.FIELD_BOOSTS[HOST_BOOST] = conf.getFloat("query.host.boost", 2.0f);
  this.PHRASE_BOOST = conf.getFloat("query.phrase.boost", 1.0f);
}
From source file:org.apache.nutch.searcher.LuceneQueryOptimizer.java
License:Apache License
/**
 * Construct an optimizer that caches and uses filters for required clauses
 * whose boost is zero.
 *
 * @param cacheSize
 *          the number of QueryFilters to cache
 * @param threshold
 *          the fraction of documents which must contain a term
 */
public LuceneQueryOptimizer(Configuration conf) {
  final int cacheSize = conf.getInt("searcher.filter.cache.size", 16);
  this.threshold = conf.getFloat("searcher.filter.cache.threshold", 0.05f);
  this.tickLength = conf.getInt("searcher.max.time.tick_length", 200);
  this.maxTickCount = conf.getInt("searcher.max.time.tick_count", -1);
  this.maxFulltextMatchesRanked = conf.getInt(Global.MAX_FULLTEXT_MATCHES_RANKED, -1);
  this.timeoutResponse = conf.getInt(Global.TIMEOUT_INDEX_SERVERS_RESPONSE, -1);
  if (timeoutResponse > 0) {
    this.maxTickCount = timeoutResponse;
    this.tickLength = 1000;
  }
  if (this.maxTickCount > 0) {
    initTimerThread(this.tickLength);
  }
}