Example usage for org.apache.hadoop.conf Configuration getLong

List of usage examples for org.apache.hadoop.conf Configuration getLong

Introduction

On this page you can find example usages of org.apache.hadoop.conf.Configuration.getLong.

Prototype

public long getLong(String name, long defaultValue) 

Source Link

Document

Get the value of the name property as a long.

Usage

From source file:org.apache.mnemonic.hadoop.MneConfigHelper.java

License:Apache License

/**
 * Looks up the memory pool size configured under the given prefix.
 *
 * @param conf   configuration to read the setting from
 * @param prefix prefix handed to {@code getConfigName} to build the property name
 * @return the configured size, or {@code DEFAULT_OUTPUT_MEM_POOL_SIZE} when the property is not set
 */
public static long getMemPoolSize(Configuration conf, String prefix) {
    final String propertyName = getConfigName(prefix, MEM_POOL_SIZE);
    return conf.getLong(propertyName, DEFAULT_OUTPUT_MEM_POOL_SIZE);
}

From source file:org.apache.mrql.Config.java

License:Apache License

/**
 * Loads the MRQL configuration parameters from a Hadoop configuration.
 *
 * <p>Each static setting is looked up under its {@code mrql.*} property name. When a
 * property is absent, the setting keeps the value it had before this call, because the
 * current value is passed as the lookup default.
 *
 * @param conf the configuration to read the {@code mrql.*} properties from
 */
public static void read(Configuration conf) {
    // execution mode flags
    hadoop_mode = conf.getBoolean("mrql.hadoop.mode", hadoop_mode);
    local_mode = conf.getBoolean("mrql.local.mode", local_mode);
    distributed_mode = conf.getBoolean("mrql.distributed.mode", distributed_mode);
    map_reduce_mode = conf.getBoolean("mrql.map.reduce.mode", map_reduce_mode);
    bsp_mode = conf.getBoolean("mrql.bsp.mode", bsp_mode);
    spark_mode = conf.getBoolean("mrql.spark.mode", spark_mode);
    flink_mode = conf.getBoolean("mrql.flink.mode", flink_mode);
    interactive = conf.getBoolean("mrql.interactive", interactive);
    compile_functional_arguments = conf.getBoolean("mrql.compile.functional.arguments",
            compile_functional_arguments);
    trace = conf.getBoolean("mrql.trace", trace);

    // sizes and limits
    nodes = conf.getInt("mrql.nodes", nodes);
    mapjoin_size = conf.getInt("mrql.mapjoin.size", mapjoin_size);
    map_cache_size = conf.getInt("mrql.in.mapper.size", map_cache_size);
    max_bag_size_print = conf.getInt("mrql.max.bag.size.print", max_bag_size_print);
    max_materialized_bag = conf.getLong("mrql.max.materialized.bag", max_materialized_bag);
    bsp_msg_size = conf.getInt("mrql.bsp.msg.size", bsp_msg_size);
    range_split_size = conf.getLong("mrql.range.split.size", range_split_size);
    max_merged_streams = conf.getInt("mrql.max.merged.streams", max_merged_streams);

    // Fall back to the current value like every other setting; a plain conf.get(name)
    // would overwrite tmpDirectory with null whenever the property is missing.
    tmpDirectory = conf.get("mrql.tmp.directory", tmpDirectory);

    // optimisation, tracing and miscellaneous flags
    use_combiner = conf.getBoolean("mrql.use.combiner", use_combiner);
    groupJoinOpt = conf.getBoolean("mrql.group.join.opt", groupJoinOpt);
    selfJoinOpt = conf.getBoolean("mrql.self.join.opt", selfJoinOpt);
    trace_execution = conf.getBoolean("mrql.trace.execution", trace_execution);
    trace_exp_execution = conf.getBoolean("mrql.trace.exp.execution", trace_exp_execution);
    quiet_execution = conf.getBoolean("mrql.quiet.execution", quiet_execution);
    testing = conf.getBoolean("mrql.testing", testing);
    info = conf.getBoolean("mrql.info", info);
    stream_window = conf.getInt("mrql.stream.window", stream_window);
}

From source file:org.apache.nutch.crawl.GeneratorMapper.java

License:Apache License

/**
 * Prepares this mapper from the job configuration: reads the filter/normalise switches,
 * builds the URL filters and normalizers they enable, and loads the distance limit,
 * generation time, fetch schedule and scoring filters.
 *
 * @param context task context supplying the job configuration
 */
@Override
public void setup(Context context) {
    final Configuration jobConf = context.getConfiguration();

    // Both switches default to true when the property is absent.
    filter = jobConf.getBoolean(GeneratorJob.GENERATOR_FILTER, true);
    normalise = jobConf.getBoolean(GeneratorJob.GENERATOR_NORMALISE, true);

    // The helpers are only constructed when their switch is on.
    if (filter) {
        filters = new URLFilters(jobConf);
    }
    if (normalise) {
        normalizers = new URLNormalizers(jobConf, URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
    }

    maxDistance = jobConf.getInt("generate.max.distance", -1);
    // Defaults to "now" when no generation time was stored in the configuration.
    curTime = jobConf.getLong(GeneratorJob.GENERATOR_CUR_TIME, System.currentTimeMillis());

    schedule = FetchScheduleFactory.getFetchSchedule(jobConf);
    scoringFilters = new ScoringFilters(jobConf);
}

From source file:org.apache.nutch.crawl.GeneratorReducer.java

License:Apache License

/**
 * Prepares this reducer from the job configuration: derives the per-reducer limit from
 * the configured top-N, and loads the max count, batch id and count mode.
 *
 * @param context task context supplying the job configuration and the reducer count
 * @throws IOException          declared by the overridden method
 * @throws InterruptedException declared by the overridden method
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    final Configuration jobConf = context.getConfiguration();

    // Long.MAX_VALUE stands for "no top-N configured" and is kept as is; any other
    // value is divided evenly across the reduce tasks.
    final long topN = jobConf.getLong(GeneratorJob.GENERATOR_TOP_N, Long.MAX_VALUE);
    limit = (topN == Long.MAX_VALUE) ? Long.MAX_VALUE : topN / context.getNumReduceTasks();

    maxCount = jobConf.getLong(GeneratorJob.GENERATOR_MAX_COUNT, -2);
    batchId = new Utf8(jobConf.get(GeneratorJob.BATCH_ID));

    // Counting is by host unless the domain mode is selected explicitly.
    final String mode = jobConf.get(GeneratorJob.GENERATOR_COUNT_MODE,
            GeneratorJob.GENERATOR_COUNT_VALUE_HOST);
    final boolean domainMode = mode.equals(GeneratorJob.GENERATOR_COUNT_VALUE_DOMAIN);
    if (domainMode) {
        byDomain = true;
    }
}

From source file:org.apache.nutch.fetcher.FetcherReducer.java

License:Apache License

/**
 * Drives the fetch: reads the fetcher settings from the job configuration, starts the
 * queue feeder and the fetcher threads, then polls every five seconds until no fetcher
 * thread is active or the hang timeout is exceeded.
 *
 * <p>On each poll it reports status, applies the queue time limit once the feeder has
 * finished, and, when a throughput threshold is configured, empties the queues after the
 * page rate has stayed below the threshold for more than the configured number of
 * consecutive checks.
 *
 * @param context task context supplying the configuration and the counters
 * @throws IOException          declared by the signature
 * @throws InterruptedException declared by the signature; note that an interrupt during
 *                              the polling sleep is caught and ignored inside the loop
 */
@Override
public void run(Context context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    this.fetchQueues = new FetchItemQueues(conf);
    int threadCount = conf.getInt("fetcher.threads.fetch", 10);
    parse = conf.getBoolean(FetcherJob.PARSE_KEY, false);
    storingContent = conf.getBoolean("fetcher.store.content", true);
    // parsing helpers are only set up when parsing is enabled
    if (parse) {
        skipTruncated = conf.getBoolean(ParserJob.SKIP_TRUNCATED, true);
        parseUtil = new ParseUtil(conf);
    }
    LOG.info("Fetcher: threads: " + threadCount);

    // the feeder is sized as threadCount * fetcher.queue.depth.multiplier
    int maxFeedPerThread = conf.getInt("fetcher.queue.depth.multiplier", 50);
    feeder = new QueueFeeder(context, fetchQueues, threadCount * maxFeedPerThread);
    feeder.start();

    for (int i = 0; i < threadCount; i++) { // spawn threads
        FetcherThread ft = new FetcherThread(context, i);
        fetcherThreads.add(ft);
        ft.start();
    }
    // select a timeout that avoids a task timeout
    // (half of mapred.task.timeout, which defaults here to 10 minutes in milliseconds)
    final long timeout = conf.getInt("mapred.task.timeout", 10 * 60 * 1000) / 2;

    // Used for threshold check, holds pages and bytes processed in the last sec
    float pagesLastSec;
    int bytesLastSec;

    // number of consecutive polls in which the page rate was below the threshold
    int throughputThresholdCurrentSequence = 0;

    // -1 (the default) disables the throughput threshold check
    int throughputThresholdPages = conf.getInt("fetcher.throughput.threshold.pages", -1);
    if (LOG.isInfoEnabled()) {
        LOG.info("Fetcher: throughput threshold: " + throughputThresholdPages);
    }
    int throughputThresholdSequence = conf.getInt("fetcher.throughput.threshold.sequence", 5);
    if (LOG.isInfoEnabled()) {
        LOG.info("Fetcher: throughput threshold sequence: " + throughputThresholdSequence);
    }
    // compared against System.currentTimeMillis() below; with the default of -1 the
    // comparison is always true, so the check is active from the first poll
    long throughputThresholdTimeLimit = conf.getLong("fetcher.throughput.threshold.check.after", -1);

    do { // wait for threads to exit
        // snapshot the counters, sleep, then turn the deltas into per-second rates
        pagesLastSec = pages.get();
        bytesLastSec = (int) bytes.get();
        final int secondsToSleep = 5;
        try {
            Thread.sleep(secondsToSleep * 1000);
        } catch (InterruptedException e) {
            // NOTE(review): the interrupt is swallowed and the interrupt flag is not
            // restored; the loop simply continues with a shorter interval.
        }

        pagesLastSec = (pages.get() - pagesLastSec) / secondsToSleep;
        bytesLastSec = ((int) bytes.get() - bytesLastSec) / secondsToSleep;

        int fetchQueuesTotalSize = fetchQueues.getTotalSize();
        reportAndLogStatus(context, pagesLastSec, bytesLastSec, fetchQueuesTotalSize);

        // once the feeder has stopped and fewer than 5 items remain, dump the queues
        boolean feederAlive = feeder.isAlive();
        if (!feederAlive && fetchQueuesTotalSize < 5) {
            fetchQueues.dump();
        }

        // check timelimit
        if (!feederAlive) {
            int hitByTimeLimit = fetchQueues.checkTimelimit();
            if (hitByTimeLimit != 0) {
                context.getCounter("FetcherStatus", "HitByTimeLimit-Queues").increment(hitByTimeLimit);
            }
        }

        // if throughput threshold is enabled
        if (throughputThresholdTimeLimit < System.currentTimeMillis() && throughputThresholdPages != -1) {
            // Check if we're dropping below the threshold
            if (pagesLastSec < throughputThresholdPages) {
                throughputThresholdCurrentSequence++;
                LOG.warn(Integer.toString(throughputThresholdCurrentSequence)
                        + ": dropping below configured threshold of "
                        + Integer.toString(throughputThresholdPages) + " pages per second");

                // Quit if we dropped below threshold too many times
                if (throughputThresholdCurrentSequence > throughputThresholdSequence) {
                    LOG.warn("Dropped below threshold too many times in a row, killing!");

                    // Disable the threshold checker
                    throughputThresholdPages = -1;

                    // Empty the queues cleanly and get number of items that were
                    // dropped
                    int hitByThrougputThreshold = fetchQueues.emptyQueues();

                    if (hitByThrougputThreshold != 0)
                        context.getCounter("FetcherStatus", "hitByThrougputThreshold")
                                .increment(hitByThrougputThreshold);
                }
            } else {
                // the rate recovered, so the consecutive-miss counter starts over
                throughputThresholdCurrentSequence = 0;
            }
        }

        // some requests seem to hang, despite all intentions
        // (the time since lastRequestStart exceeds the timeout: log any threads that
        // are still alive and leave run() without waiting for them)
        if ((System.currentTimeMillis() - lastRequestStart.get()) > timeout) {
            if (LOG.isWarnEnabled() && activeThreads.get() > 0) {
                LOG.warn("Aborting with " + activeThreads + " hung threads.");
                for (int i = 0; i < fetcherThreads.size(); i++) {
                    FetcherThread thread = fetcherThreads.get(i);
                    if (thread.isAlive()) {
                        LOG.warn("Thread #" + i + " hung while processing " + thread.reprUrl);
                        // stack traces of the hung threads are only collected at debug level
                        if (LOG.isDebugEnabled()) {
                            StackTraceElement[] stack = thread.getStackTrace();
                            StringBuilder sb = new StringBuilder();
                            sb.append("Stack of thread #").append(i).append(":\n");
                            for (StackTraceElement s : stack) {
                                sb.append(s.toString()).append('\n');
                            }
                            LOG.debug(sb.toString());
                        }
                    }
                }
            }
            return;
        }

    } while (activeThreads.get() > 0);
    LOG.info("-activeThreads=" + activeThreads);
}

From source file:org.apache.nutch.fetcher.FetchItemQueues.java

License:Apache License

public FetchItemQueues(Configuration conf) {
    this.conf = conf;
    this.maxThreads = conf.getInt("fetcher.threads.per.queue", 1);
    queueMode = conf.get("fetcher.queue.mode", QUEUE_MODE_HOST);
    queueMode = checkQueueMode(queueMode);
    LOG.info("Using queue mode : " + queueMode);

    this.crawlDelay = (long) (conf.getFloat("fetcher.server.delay", 1.0f) * 1000);
    this.minCrawlDelay = (long) (conf.getFloat("fetcher.server.min.delay", 0.0f) * 1000);
    this.timelimit = conf.getLong("fetcher.timelimit", -1);
    this.maxExceptionsPerQueue = conf.getInt("fetcher.max.exceptions.per.queue", -1);
}

From source file:org.apache.nutch.host.HostDb.java

License:Apache License

/**
 * Opens the host store and builds the size-bounded cache in front of it.
 *
 * @param conf configuration supplying the store settings, the cache size and the
 *             cache concurrency level
 * @throws GoraException declared by the signature
 */
public HostDb(Configuration conf) throws GoraException {
    try {
        hostStore = StorageUtils.createWebStore(conf, String.class, Host.class);
    } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
    }

    final long lruSize = conf.getLong(HOSTDB_LRU_SIZE, DEFAULT_LRU_SIZE);
    final int concurrencyLevel = conf.getInt(HOSTDB_CONCURRENCY_LEVEL, DEFAULT_HOSTDB_CONCURRENCY_LEVEL);

    // The removal listener decides whether the store needs a flush when an entry
    // leaves the cache, so that the put-flush-get ("read your own write") semantic
    // of DataStore is respected.
    RemovalListener<String, CacheHost> flushOnEviction = new RemovalListener<String, CacheHost>() {
        @Override
        public void onRemoval(RemovalNotification<String, CacheHost> notification) {
            CacheHost evicted = notification.getValue();
            // The NULL_HOST placeholder never triggers a flush.
            if (evicted != NULL_HOST && evicted.timestamp < lastFlush.get()) {
                try {
                    hostStore.flush();
                } catch (Exception e) {
                    throw new RuntimeException(e);
                }
                lastFlush.set(System.currentTimeMillis());
            }
        }
    };

    cache = CacheBuilder.newBuilder()
            .maximumSize(lruSize)
            .removalListener(flushOnEviction)
            .concurrencyLevel(concurrencyLevel)
            .build();
    lastFlush = new AtomicLong(System.currentTimeMillis());
}

From source file:org.apache.nutch.protocol.htmlunit.HtmlUnitWebDriver.java

License:Apache License

/**
 * Creates an HtmlUnit web driver, applies the configured page-load timeout and loads
 * the given URL.
 *
 * <p>The static JavaScript, CSS and redirect settings are refreshed from the
 * configuration before the driver is created.
 *
 * @param url  the page to load
 * @param conf configuration supplying the HtmlUnit and redirect settings
 * @return the driver after loading the page; on a {@code TimeoutException} the driver is
 *         returned as it is at that point
 * @throws RuntimeException wrapping any other exception, after {@code cleanUpDriver} was called
 */
public static WebDriver getDriverForPage(String url, Configuration conf) {
    final long pageLoadTimeoutSeconds = conf.getLong("page.load.delay", 3);
    enableJavascript = conf.getBoolean("htmlunit.enable.javascript", true);
    enableCss = conf.getBoolean("htmlunit.enable.css", false);
    javascriptTimeout = conf.getLong("htmlunit.javascript.timeout", 3500);

    // Redirects are followed only when a positive maximum is configured.
    final int redirectLimit = Integer.parseInt(conf.get("http.redirect.max", "0"));
    enableRedirect = redirectLimit > 0;
    maxRedirects = redirectLimit;

    WebDriver driver = null;
    try {
        driver = new HtmlUnitWebDriver();
        driver.manage().timeouts().pageLoadTimeout(pageLoadTimeoutSeconds, TimeUnit.SECONDS);
        driver.get(url);
        return driver;
    } catch (Exception e) {
        if (!(e instanceof TimeoutException)) {
            cleanUpDriver(driver);
            throw new RuntimeException(e);
        }
        // A timeout is not fatal: hand back the driver with whatever it has loaded.
        LOG.debug("HtmlUnit WebDriver: Timeout Exception: Capturing whatever loaded so far...");
        return driver;
    }
}

From source file:org.apache.nutch.protocol.interactiveselenium.DefaultClickAllAjaxLinksHandler.java

License:Apache License

/**
 * Clicks every anchor whose {@code href} is {@code javascript:void(null);} and collects
 * the body HTML that results from each click.
 *
 * <p>After each click except the one on the last anchor, the body's {@code innerHTML}
 * is appended to the accumulated data and the page is refreshed. When the last anchor
 * is such a link, the accumulated data is appended to the live document body instead.
 * Any exception is logged and ends processing; the data gathered so far is returned.
 *
 * @param driver the web driver positioned on the page to process
 * @return the concatenated body HTML captured after the clicks (possibly empty)
 */
public String processDriver(WebDriver driver) {

    String accumulatedData = "";
    try {

        driver.findElement(By.tagName("body")).getAttribute("innerHTML");
        Configuration conf = NutchConfiguration.create();
        // Read the delay once instead of on every refresh.
        long pageLoadDelay = conf.getLong("libselenium.page.load.delay", 3);
        // NOTE(review): a WebDriverWait that is constructed without calling until(...)
        // does not block; kept as in the original because the condition to wait for
        // is not visible here.
        new WebDriverWait(driver, pageLoadDelay);

        List<WebElement> atags = driver.findElements(By.tagName("a"));
        int numberofajaxlinks = atags.size();
        for (int i = 0; i < numberofajaxlinks; i++) {

            // Fetch the attribute once; each getAttribute call goes through the driver.
            String href = atags.get(i).getAttribute("href");
            if ("javascript:void(null);".equals(href)) {

                atags.get(i).click();

                if (i == numberofajaxlinks - 1) {
                    // append everything to the driver in the last round
                    // The HTML is passed as a script argument: splicing raw markup into
                    // the script source produced invalid JavaScript whenever any data
                    // had been accumulated.
                    JavascriptExecutor jsx = (JavascriptExecutor) driver;
                    jsx.executeScript(
                            "document.body.innerHTML = document.body.innerHTML + arguments[0];",
                            accumulatedData);
                    continue;
                }

                accumulatedData += driver.findElement(By.tagName("body")).getAttribute("innerHTML");

                // refreshing the handlers as the page was interacted with
                driver.navigate().refresh();
                new WebDriverWait(driver, pageLoadDelay);
                atags = driver.findElements(By.tagName("a"));
            }
        }
    } catch (Exception e) {
        LOG.info(StringUtils.stringifyException(e));
    }
    return accumulatedData;
}

From source file:org.apache.nutch.protocol.selenium.HttpWebClient.java

License:Apache License

/**
 * Creates the Selenium web driver selected by {@code selenium.driver}, applies the
 * configured page-load timeout and loads the given URL.
 *
 * <p>Supported driver types are {@code firefox}, {@code chrome} and {@code remote}.
 * For {@code remote}, the hub URL is built from the {@code selenium.hub.*} properties
 * and the browser is chosen by {@code selenium.grid.driver}. An unrecognised choice is
 * logged and replaced by a default driver.
 *
 * @param url  the page to load
 * @param conf configuration supplying the Selenium settings
 * @return the driver after loading the page; on a {@code TimeoutException} the driver is
 *         returned as it is at that point
 * @throws RuntimeException wrapping any other exception, after {@code cleanUpDriver} was called
 */
public static WebDriver getDriverForPage(String url, Configuration conf) {
    WebDriver driver = null;
    long pageLoadWait = conf.getLong("page.load.delay", 3);

    try {
        String driverType = conf.get("selenium.driver", "firefox");
        boolean enableHeadlessMode = conf.getBoolean("selenium.enable.headless", false);

        switch (driverType) {
        case "firefox":
            String geckoDriverPath = conf.get("selenium.grid.binary", "/root/geckodriver");
            driver = createFirefoxWebDriver(geckoDriverPath, enableHeadlessMode);
            break;
        case "chrome":
            String chromeDriverPath = conf.get("selenium.grid.binary", "/root/chromedriver");
            driver = createChromeWebDriver(chromeDriverPath, enableHeadlessMode);
            break;
        // case "opera":
        // // This class is provided as a convenience for easily testing the
        // Chrome browser.
        // String operaDriverPath = conf.get("selenium.grid.binary",
        // "/root/operadriver");
        // driver = createOperaWebDriver(operaDriverPath, enableHeadlessMode);
        // break;
        case "remote":
            String seleniumHubHost = conf.get("selenium.hub.host", "localhost");
            int seleniumHubPort = Integer.parseInt(conf.get("selenium.hub.port", "4444"));
            String seleniumHubPath = conf.get("selenium.hub.path", "/wd/hub");
            String seleniumHubProtocol = conf.get("selenium.hub.protocol", "http");
            URL seleniumHubUrl = new URL(seleniumHubProtocol, seleniumHubHost, seleniumHubPort,
                    seleniumHubPath);

            String seleniumGridDriver = conf.get("selenium.grid.driver", "firefox");

            switch (seleniumGridDriver) {
            case "firefox":
                driver = createFirefoxRemoteWebDriver(seleniumHubUrl, enableHeadlessMode);
                break;
            case "chrome":
                driver = createChromeRemoteWebDriver(seleniumHubUrl, enableHeadlessMode);
                break;
            case "random":
                driver = createRandomRemoteWebDriver(seleniumHubUrl, enableHeadlessMode);
                break;
            default:
                // Report the grid driver that was not recognised; driverType is always
                // "remote" in this branch and would hide the offending value.
                LOG.error(
                        "The Selenium Grid WebDriver choice {} is not available... defaulting to FirefoxDriver().",
                        seleniumGridDriver);
                driver = createDefaultRemoteWebDriver(seleniumHubUrl, enableHeadlessMode);
                break;
            }
            break;
        default:
            LOG.error("The Selenium WebDriver choice {} is not available... defaulting to FirefoxDriver().",
                    driverType);
            FirefoxOptions options = new FirefoxOptions();
            driver = new FirefoxDriver(options);
            break;
        }
        LOG.debug("Selenium {} WebDriver selected.", driverType);

        driver.manage().timeouts().pageLoadTimeout(pageLoadWait, TimeUnit.SECONDS);
        driver.get(url);
    } catch (Exception e) {
        if (e instanceof TimeoutException) {
            // A timeout is not fatal: hand back the driver with whatever it has loaded.
            LOG.error("Selenium WebDriver: Timeout Exception: Capturing whatever loaded so far...");
            return driver;
        } else {
            LOG.error(e.toString());
        }
        // Any other failure: release the driver (it may still be null here) and rethrow.
        cleanUpDriver(driver);
        throw new RuntimeException(e);
    }

    return driver;
}