Usage examples for org.apache.hadoop.conf.Configuration#getLong
public long getLong(String name, long defaultValue)
Gets the value of the name property as a long; defaultValue is returned when the property is not set.
From source file:org.apache.mnemonic.hadoop.MneConfigHelper.java
License:Apache License
/**
 * Reads the memory pool size stored under the given prefix.
 *
 * @param conf   configuration to read the value from
 * @param prefix prefix handed to {@code getConfigName} together with
 *               {@code MEM_POOL_SIZE} to build the property name
 * @return the value of that property as a {@code long}, with
 *         {@code DEFAULT_OUTPUT_MEM_POOL_SIZE} supplied as the default
 */
public static long getMemPoolSize(Configuration conf, String prefix) {
  final String propertyName = getConfigName(prefix, MEM_POOL_SIZE);
  return conf.getLong(propertyName, DEFAULT_OUTPUT_MEM_POOL_SIZE);
}
From source file:org.apache.mrql.Config.java
License:Apache License
/**
 * Loads the MRQL settings from the given configuration into the static
 * fields of this class.
 *
 * <p>Each setting is looked up under its {@code mrql.*} key; the field's
 * current value is passed as the default, so it is kept when the key is
 * not present.
 *
 * @param conf the configuration to read the parameters from
 */
public static void read(Configuration conf) {
  // evaluation modes
  hadoop_mode = conf.getBoolean("mrql.hadoop.mode", hadoop_mode);
  local_mode = conf.getBoolean("mrql.local.mode", local_mode);
  distributed_mode = conf.getBoolean("mrql.distributed.mode", distributed_mode);
  map_reduce_mode = conf.getBoolean("mrql.map.reduce.mode", map_reduce_mode);
  bsp_mode = conf.getBoolean("mrql.bsp.mode", bsp_mode);
  spark_mode = conf.getBoolean("mrql.spark.mode", spark_mode);
  flink_mode = conf.getBoolean("mrql.flink.mode", flink_mode);

  // front-end behaviour
  interactive = conf.getBoolean("mrql.interactive", interactive);
  compile_functional_arguments =
      conf.getBoolean("mrql.compile.functional.arguments", compile_functional_arguments);
  trace = conf.getBoolean("mrql.trace", trace);

  // sizes and limits
  nodes = conf.getInt("mrql.nodes", nodes);
  mapjoin_size = conf.getInt("mrql.mapjoin.size", mapjoin_size);
  map_cache_size = conf.getInt("mrql.in.mapper.size", map_cache_size);
  max_bag_size_print = conf.getInt("mrql.max.bag.size.print", max_bag_size_print);
  max_materialized_bag = conf.getLong("mrql.max.materialized.bag", max_materialized_bag);
  bsp_msg_size = conf.getInt("mrql.bsp.msg.size", bsp_msg_size);
  range_split_size = conf.getLong("mrql.range.split.size", range_split_size);
  max_merged_streams = conf.getInt("mrql.max.merged.streams", max_merged_streams);

  // NOTE(review): unlike every other setting, no default is passed here, so
  // the previous value of tmpDirectory is not kept - confirm this is intended.
  tmpDirectory = conf.get("mrql.tmp.directory");

  // optimizations
  use_combiner = conf.getBoolean("mrql.use.combiner", use_combiner);
  groupJoinOpt = conf.getBoolean("mrql.group.join.opt", groupJoinOpt);
  selfJoinOpt = conf.getBoolean("mrql.self.join.opt", selfJoinOpt);

  // diagnostics
  trace_execution = conf.getBoolean("mrql.trace.execution", trace_execution);
  trace_exp_execution = conf.getBoolean("mrql.trace.exp.execution", trace_exp_execution);
  quiet_execution = conf.getBoolean("mrql.quiet.execution", quiet_execution);
  testing = conf.getBoolean("mrql.testing", testing);
  info = conf.getBoolean("mrql.info", info);

  // streaming
  stream_window = conf.getInt("mrql.stream.window", stream_window);
}
From source file:org.apache.nutch.crawl.GeneratorMapper.java
License:Apache License
@Override public void setup(Context context) { Configuration conf = context.getConfiguration(); filter = conf.getBoolean(GeneratorJob.GENERATOR_FILTER, true); normalise = conf.getBoolean(GeneratorJob.GENERATOR_NORMALISE, true); if (filter) { filters = new URLFilters(conf); }//w w w.j ava 2s .c o m if (normalise) { normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_GENERATE_HOST_COUNT); } maxDistance = conf.getInt("generate.max.distance", -1); curTime = conf.getLong(GeneratorJob.GENERATOR_CUR_TIME, System.currentTimeMillis()); schedule = FetchScheduleFactory.getFetchSchedule(conf); scoringFilters = new ScoringFilters(conf); }
From source file:org.apache.nutch.crawl.GeneratorReducer.java
License:Apache License
/**
 * Prepares this reducer from the job configuration.
 *
 * <p>The global top-N limit is divided by the number of reduce tasks to get
 * the limit of this reducer; when the top-N value equals
 * {@link Long#MAX_VALUE} the limit stays at {@link Long#MAX_VALUE}. The
 * maximum count, the batch id and the count mode are read as well;
 * {@code byDomain} is switched on only when the count mode equals the
 * domain value.
 *
 * @param context the task context supplying configuration and reducer count
 * @throws IOException          declared by the overridden method
 * @throws InterruptedException declared by the overridden method
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  Configuration conf = context.getConfiguration();

  final long topN = conf.getLong(GeneratorJob.GENERATOR_TOP_N, Long.MAX_VALUE);
  // an unbounded top-N must stay unbounded instead of being split per reducer
  limit = (topN == Long.MAX_VALUE)
      ? Long.MAX_VALUE
      : topN / context.getNumReduceTasks();

  maxCount = conf.getLong(GeneratorJob.GENERATOR_MAX_COUNT, -2);
  batchId = new Utf8(conf.get(GeneratorJob.BATCH_ID));

  final String mode = conf.get(GeneratorJob.GENERATOR_COUNT_MODE,
      GeneratorJob.GENERATOR_COUNT_VALUE_HOST);
  if (mode.equals(GeneratorJob.GENERATOR_COUNT_VALUE_DOMAIN)) {
    byDomain = true;
  }
}
From source file:org.apache.nutch.fetcher.FetcherReducer.java
License:Apache License
@Override public void run(Context context) throws IOException, InterruptedException { Configuration conf = context.getConfiguration(); this.fetchQueues = new FetchItemQueues(conf); int threadCount = conf.getInt("fetcher.threads.fetch", 10); parse = conf.getBoolean(FetcherJob.PARSE_KEY, false); storingContent = conf.getBoolean("fetcher.store.content", true); if (parse) {//w w w . j a va2 s . c om skipTruncated = conf.getBoolean(ParserJob.SKIP_TRUNCATED, true); parseUtil = new ParseUtil(conf); } LOG.info("Fetcher: threads: " + threadCount); int maxFeedPerThread = conf.getInt("fetcher.queue.depth.multiplier", 50); feeder = new QueueFeeder(context, fetchQueues, threadCount * maxFeedPerThread); feeder.start(); for (int i = 0; i < threadCount; i++) { // spawn threads FetcherThread ft = new FetcherThread(context, i); fetcherThreads.add(ft); ft.start(); } // select a timeout that avoids a task timeout final long timeout = conf.getInt("mapred.task.timeout", 10 * 60 * 1000) / 2; // Used for threshold check, holds pages and bytes processed in the last sec float pagesLastSec; int bytesLastSec; int throughputThresholdCurrentSequence = 0; int throughputThresholdPages = conf.getInt("fetcher.throughput.threshold.pages", -1); if (LOG.isInfoEnabled()) { LOG.info("Fetcher: throughput threshold: " + throughputThresholdPages); } int throughputThresholdSequence = conf.getInt("fetcher.throughput.threshold.sequence", 5); if (LOG.isInfoEnabled()) { LOG.info("Fetcher: throughput threshold sequence: " + throughputThresholdSequence); } long throughputThresholdTimeLimit = conf.getLong("fetcher.throughput.threshold.check.after", -1); do { // wait for threads to exit pagesLastSec = pages.get(); bytesLastSec = (int) bytes.get(); final int secondsToSleep = 5; try { Thread.sleep(secondsToSleep * 1000); } catch (InterruptedException e) { } pagesLastSec = (pages.get() - pagesLastSec) / secondsToSleep; bytesLastSec = ((int) bytes.get() - bytesLastSec) / secondsToSleep; int fetchQueuesTotalSize = 
fetchQueues.getTotalSize(); reportAndLogStatus(context, pagesLastSec, bytesLastSec, fetchQueuesTotalSize); boolean feederAlive = feeder.isAlive(); if (!feederAlive && fetchQueuesTotalSize < 5) { fetchQueues.dump(); } // check timelimit if (!feederAlive) { int hitByTimeLimit = fetchQueues.checkTimelimit(); if (hitByTimeLimit != 0) { context.getCounter("FetcherStatus", "HitByTimeLimit-Queues").increment(hitByTimeLimit); } } // if throughput threshold is enabled if (throughputThresholdTimeLimit < System.currentTimeMillis() && throughputThresholdPages != -1) { // Check if we're dropping below the threshold if (pagesLastSec < throughputThresholdPages) { throughputThresholdCurrentSequence++; LOG.warn(Integer.toString(throughputThresholdCurrentSequence) + ": dropping below configured threshold of " + Integer.toString(throughputThresholdPages) + " pages per second"); // Quit if we dropped below threshold too many times if (throughputThresholdCurrentSequence > throughputThresholdSequence) { LOG.warn("Dropped below threshold too many times in a row, killing!"); // Disable the threshold checker throughputThresholdPages = -1; // Empty the queues cleanly and get number of items that were // dropped int hitByThrougputThreshold = fetchQueues.emptyQueues(); if (hitByThrougputThreshold != 0) context.getCounter("FetcherStatus", "hitByThrougputThreshold") .increment(hitByThrougputThreshold); } } else { throughputThresholdCurrentSequence = 0; } } // some requests seem to hang, despite all intentions if ((System.currentTimeMillis() - lastRequestStart.get()) > timeout) { if (LOG.isWarnEnabled() && activeThreads.get() > 0) { LOG.warn("Aborting with " + activeThreads + " hung threads."); for (int i = 0; i < fetcherThreads.size(); i++) { FetcherThread thread = fetcherThreads.get(i); if (thread.isAlive()) { LOG.warn("Thread #" + i + " hung while processing " + thread.reprUrl); if (LOG.isDebugEnabled()) { StackTraceElement[] stack = thread.getStackTrace(); StringBuilder sb = new 
StringBuilder(); sb.append("Stack of thread #").append(i).append(":\n"); for (StackTraceElement s : stack) { sb.append(s.toString()).append('\n'); } LOG.debug(sb.toString()); } } } } return; } } while (activeThreads.get() > 0); LOG.info("-activeThreads=" + activeThreads); }
From source file:org.apache.nutch.fetcher.FetchItemQueues.java
License:Apache License
public FetchItemQueues(Configuration conf) { this.conf = conf; this.maxThreads = conf.getInt("fetcher.threads.per.queue", 1); queueMode = conf.get("fetcher.queue.mode", QUEUE_MODE_HOST); queueMode = checkQueueMode(queueMode); LOG.info("Using queue mode : " + queueMode); this.crawlDelay = (long) (conf.getFloat("fetcher.server.delay", 1.0f) * 1000); this.minCrawlDelay = (long) (conf.getFloat("fetcher.server.min.delay", 0.0f) * 1000); this.timelimit = conf.getLong("fetcher.timelimit", -1); this.maxExceptionsPerQueue = conf.getInt("fetcher.max.exceptions.per.queue", -1); }
From source file:org.apache.nutch.host.HostDb.java
License:Apache License
public HostDb(Configuration conf) throws GoraException { try {// w w w. j av a 2 s. com hostStore = StorageUtils.createWebStore(conf, String.class, Host.class); } catch (ClassNotFoundException e) { throw new RuntimeException(e); } // Create a cache. // We add a removal listener to see if we need to flush the store, // in order to adhere to the put-flush-get semantic // ("read your own write") of DataStore. long lruSize = conf.getLong(HOSTDB_LRU_SIZE, DEFAULT_LRU_SIZE); int concurrencyLevel = conf.getInt(HOSTDB_CONCURRENCY_LEVEL, DEFAULT_HOSTDB_CONCURRENCY_LEVEL); RemovalListener<String, CacheHost> listener = new RemovalListener<String, CacheHost>() { @Override public void onRemoval(RemovalNotification<String, CacheHost> notification) { CacheHost removeFromCacheHost = notification.getValue(); if (removeFromCacheHost != NULL_HOST) { if (removeFromCacheHost.timestamp < lastFlush.get()) { try { hostStore.flush(); } catch (Exception e) { throw new RuntimeException(e); } lastFlush.set(System.currentTimeMillis()); } } } }; cache = CacheBuilder.newBuilder().maximumSize(lruSize).removalListener(listener) .concurrencyLevel(concurrencyLevel).build(); lastFlush = new AtomicLong(System.currentTimeMillis()); }
From source file:org.apache.nutch.protocol.htmlunit.HtmlUnitWebDriver.java
License:Apache License
/**
 * Creates an {@code HtmlUnitWebDriver}, applies the page-load timeout and
 * loads the given URL.
 *
 * <p>Also refreshes the static JavaScript, CSS, JavaScript-timeout and
 * redirect settings from the configuration before the driver is created.
 *
 * @param url  the page to load
 * @param conf configuration supplying the HtmlUnit and redirect settings
 * @return the driver; when loading hit a {@code TimeoutException} the driver
 *         is returned with whatever was loaded so far
 * @throws RuntimeException wrapping any other exception, after
 *                          {@code cleanUpDriver} was called on the driver
 */
public static WebDriver getDriverForPage(String url, Configuration conf) {
  final long pageLoadTimeoutSeconds = conf.getLong("page.load.delay", 3);

  enableJavascript = conf.getBoolean("htmlunit.enable.javascript", true);
  enableCss = conf.getBoolean("htmlunit.enable.css", false);
  javascriptTimeout = conf.getLong("htmlunit.javascript.timeout", 3500);

  final int redirects = Integer.parseInt(conf.get("http.redirect.max", "0"));
  enableRedirect = redirects > 0;
  maxRedirects = redirects;

  WebDriver driver = null;
  try {
    driver = new HtmlUnitWebDriver();
    driver.manage().timeouts().pageLoadTimeout(pageLoadTimeoutSeconds, TimeUnit.SECONDS);
    driver.get(url);
  } catch (Exception e) {
    if (!(e instanceof TimeoutException)) {
      // anything but a timeout is fatal: release the driver and propagate
      cleanUpDriver(driver);
      throw new RuntimeException(e);
    }
    LOG.debug("HtmlUnit WebDriver: Timeout Exception: Capturing whatever loaded so far...");
  }
  return driver;
}
From source file:org.apache.nutch.protocol.interactiveselenium.DefaultClickAllAjaxLinksHandler.java
License:Apache License
public String processDriver(WebDriver driver) { String accumulatedData = ""; try {//from w w w . ja v a 2 s .co m driver.findElement(By.tagName("body")).getAttribute("innerHTML"); Configuration conf = NutchConfiguration.create(); new WebDriverWait(driver, conf.getLong("libselenium.page.load.delay", 3)); List<WebElement> atags = driver.findElements(By.tagName("a")); int numberofajaxlinks = atags.size(); for (int i = 0; i < numberofajaxlinks; i++) { if (atags.get(i).getAttribute("href") != null && atags.get(i).getAttribute("href").equals("javascript:void(null);")) { atags.get(i).click(); if (i == numberofajaxlinks - 1) { // append everything to the driver in the last round JavascriptExecutor jsx = (JavascriptExecutor) driver; jsx.executeScript( "document.body.innerHTML=document.body.innerHTML " + accumulatedData + ";"); continue; } accumulatedData += driver.findElement(By.tagName("body")).getAttribute("innerHTML"); // refreshing the handlers as the page was interacted with driver.navigate().refresh(); new WebDriverWait(driver, conf.getLong("libselenium.page.load.delay", 3)); atags = driver.findElements(By.tagName("a")); } } } catch (Exception e) { LOG.info(StringUtils.stringifyException(e)); } return accumulatedData; }
From source file:org.apache.nutch.protocol.selenium.HttpWebClient.java
License:Apache License
public static WebDriver getDriverForPage(String url, Configuration conf) { WebDriver driver = null;//from ww w .j av a 2 s.c o m long pageLoadWait = conf.getLong("page.load.delay", 3); try { String driverType = conf.get("selenium.driver", "firefox"); boolean enableHeadlessMode = conf.getBoolean("selenium.enable.headless", false); switch (driverType) { case "firefox": String geckoDriverPath = conf.get("selenium.grid.binary", "/root/geckodriver"); driver = createFirefoxWebDriver(geckoDriverPath, enableHeadlessMode); break; case "chrome": String chromeDriverPath = conf.get("selenium.grid.binary", "/root/chromedriver"); driver = createChromeWebDriver(chromeDriverPath, enableHeadlessMode); break; // case "opera": // // This class is provided as a convenience for easily testing the // Chrome browser. // String operaDriverPath = conf.get("selenium.grid.binary", // "/root/operadriver"); // driver = createOperaWebDriver(operaDriverPath, enableHeadlessMode); // break; case "remote": String seleniumHubHost = conf.get("selenium.hub.host", "localhost"); int seleniumHubPort = Integer.parseInt(conf.get("selenium.hub.port", "4444")); String seleniumHubPath = conf.get("selenium.hub.path", "/wd/hub"); String seleniumHubProtocol = conf.get("selenium.hub.protocol", "http"); URL seleniumHubUrl = new URL(seleniumHubProtocol, seleniumHubHost, seleniumHubPort, seleniumHubPath); String seleniumGridDriver = conf.get("selenium.grid.driver", "firefox"); switch (seleniumGridDriver) { case "firefox": driver = createFirefoxRemoteWebDriver(seleniumHubUrl, enableHeadlessMode); break; case "chrome": driver = createChromeRemoteWebDriver(seleniumHubUrl, enableHeadlessMode); break; case "random": driver = createRandomRemoteWebDriver(seleniumHubUrl, enableHeadlessMode); break; default: LOG.error( "The Selenium Grid WebDriver choice {} is not available... 
defaulting to FirefoxDriver().", driverType); driver = createDefaultRemoteWebDriver(seleniumHubUrl, enableHeadlessMode); break; } break; default: LOG.error("The Selenium WebDriver choice {} is not available... defaulting to FirefoxDriver().", driverType); FirefoxOptions options = new FirefoxOptions(); driver = new FirefoxDriver(options); break; } LOG.debug("Selenium {} WebDriver selected.", driverType); driver.manage().timeouts().pageLoadTimeout(pageLoadWait, TimeUnit.SECONDS); driver.get(url); } catch (Exception e) { if (e instanceof TimeoutException) { LOG.error("Selenium WebDriver: Timeout Exception: Capturing whatever loaded so far..."); return driver; } else { LOG.error(e.toString()); } cleanUpDriver(driver); throw new RuntimeException(e); } return driver; }