Java tutorial
/* * Copyright 2008 FatWire Corporation. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.fatwire.dta.sscrawler; import java.io.File; import java.net.URI; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.List; import java.util.concurrent.ThreadPoolExecutor; import javax.management.MBeanServer; import javax.management.ObjectName; import org.apache.commons.cli.BasicParser; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.Option; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.commons.httpclient.ProxyHost; import org.apache.commons.httpclient.UsernamePasswordCredentials; import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.LogFactory; import org.apache.log4j.xml.DOMConfigurator; import com.fatwire.dta.sscrawler.domain.HostConfig; import com.fatwire.dta.sscrawler.reporting.Reporter; import com.fatwire.dta.sscrawler.reporting.reporters.DefaultArgumentsAsPageCriteriaReporter; import com.fatwire.dta.sscrawler.reporting.reporters.InnerLinkReporter; import com.fatwire.dta.sscrawler.reporting.reporters.InnerPageletPerOuterReporter; import com.fatwire.dta.sscrawler.reporting.reporters.NestingReporter; import com.fatwire.dta.sscrawler.reporting.reporters.Non200ResponseCodeReporter; import com.fatwire.dta.sscrawler.reporting.reporters.NotCachedReporter; import com.fatwire.dta.sscrawler.reporting.reporters.NumberOfInnerPageletsReporter; import com.fatwire.dta.sscrawler.reporting.reporters.OuterLinkCollectingReporter; import com.fatwire.dta.sscrawler.reporting.reporters.PageCollectingReporter; import com.fatwire.dta.sscrawler.reporting.reporters.PageCriteriaReporter; import com.fatwire.dta.sscrawler.reporting.reporters.PageRenderTimeReporter; import com.fatwire.dta.sscrawler.reporting.reporters.PageletOnlyReporter; import com.fatwire.dta.sscrawler.reporting.reporters.PageletReuseReporter; import com.fatwire.dta.sscrawler.reporting.reporters.PageletTimingsStatisticsReporter; import com.fatwire.dta.sscrawler.reporting.reporters.PageletUriCollectingReporter; import com.fatwire.dta.sscrawler.reporting.reporters.RootElementReporter; import com.fatwire.dta.sscrawler.reporting.reporters.SameContentPageletReporter; import com.fatwire.dta.sscrawler.reporting.reporters.SummaryReporter; import com.fatwire.dta.sscrawler.reporting.reporters.SuspiciousContextParamReporter; import com.fatwire.dta.sscrawler.reporting.reports.FileReport; import com.fatwire.dta.sscrawler.util.SSUriHelper; import com.fatwire.dta.sscrawler.util.UriHelperFactory; public class App { @SuppressWarnings("static-access") public static Options setUpCmd() { final Options options = new Options(); options.addOption("h", "help", false, "print this message."); final Option reportDir = OptionBuilder.withArgName("dir").hasArg() .withDescription("Directory where reports are stored").withLongOpt("reportDir").create("d"); options.addOption(reportDir); final Option max = OptionBuilder.withArgName("num").hasArg() .withDescription("Maximum number of pages, default is unlimited").withLongOpt("max").create("m"); options.addOption(max); final Option uriHelperFactory = OptionBuilder.withArgName("classname").hasArg() .withDescription("Class for constructing urls").withLongOpt("uriHelperFactory").create("f"); uriHelperFactory.setType(UriHelperFactory.class); options.addOption(uriHelperFactory); final Option threads = OptionBuilder.withArgName("num").hasArg() .withDescription("Number of concurrent threads that are reading from ContentServer") .withLongOpt("threads").create("t"); options.addOption(threads); final Option proxyUsername = OptionBuilder.withArgName("username").hasArg() .withDescription("Proxy Username").withLongOpt("proxyUsername").create("pu"); options.addOption(proxyUsername); final Option proxyPassword = OptionBuilder.withArgName("password").hasArg() .withDescription("Proxy Password").withLongOpt("proxyPassword").create("pw"); options.addOption(proxyPassword); final Option proxyHost = OptionBuilder.withArgName("host").hasArg().withDescription("Proxy hostname") .withLongOpt("proxyHost").create("ph"); options.addOption(proxyHost); final Option proxyPort = OptionBuilder.withArgName("port").hasArg().withDescription("Proxy port number") .withLongOpt("proxyPort").create("pp"); options.addOption(proxyPort); return options; } /** * @param args * @throws Exception */ public static void main(final String[] args) throws Exception { if (args.length < 1) { printUsage(); System.exit(1); } DOMConfigurator.configure("conf/log4j.xml"); final Options o = App.setUpCmd(); final CommandLineParser p = new BasicParser(); try { final CommandLine s = p.parse(o, args); if (s.hasOption('h')) { printUsage(); } else if (s.getArgList().contains("crawler") && s.getArgList().size() > 1) { new App().doWork(s); } else if (s.getArgList().contains("warmer") && s.getArgList().size() > 1) { new CacheWarmer().doWork(s); } else { System.err.println("no subcommand and/or URI found on " + s.getArgList()); printUsage(); System.exit(1); } } catch (final ParseException e) { System.err.println(e.getMessage()); printUsage(); System.exit(1); } } public static void printUsage() { final Options options = App.setUpCmd(); final HelpFormatter formatter = new HelpFormatter(); formatter.printHelp("java " + App.class.getName() + " <subcommand> [options] [argument]\n", "Tool to retrieve pages from ContentServer as-if SatelliteServer is rendering them.\nVarious reports on the implemented caching strategy are produced." + "Argument: the start uri in the form of 'http://localhost:8080/cs/ContentServer?pagename=...'.\n" + "Available subcommands:\n crawler: extensive reporting on the discovered pagelets.\n warmer: warm the cache.\n\n", options, "For more into see http://www.nl.fatwire.com/dta/ss-crawler/", true); } private HostConfig createHostConfig(final URI uri) { final HostConfig hostConfig = new HostConfig(); hostConfig.setHostname(uri.getHost()); hostConfig.setPort(uri.getPort() == -1 ? 80 : uri.getPort()); hostConfig.setDomain(uri.getPath()); hostConfig.setProtocol(uri.getScheme()); return hostConfig; } protected void doWork(final CommandLine cmd) throws Exception { final Crawler crawler = new Crawler(); URI startUri = null; startUri = URI.create(cmd.getArgs()[1]); if (cmd.hasOption('m')) { crawler.setMaxPages(Integer.parseInt(cmd.getOptionValue('m'))); } final int threads = Integer.parseInt(cmd.getOptionValue('t', "5")); if (startUri == null) { throw new IllegalArgumentException("startUri is not set"); } final int t = startUri.toASCIIString().indexOf("/ContentServer"); if (t == -1) { throw new IllegalArgumentException("/ContentServer is not found on the startUri."); } crawler.setStartUri(new URI(null, null, null, -1, startUri.getRawPath(), startUri.getRawQuery(), startUri.getFragment())); final HostConfig hc = createHostConfig(URI.create(startUri.toASCIIString().substring(0, t))); final String proxyUsername = cmd.getOptionValue("pu"); final String proxyPassword = cmd.getOptionValue("pw"); final String proxyHost = cmd.getOptionValue("ph"); final int proxyPort = Integer.parseInt(cmd.getOptionValue("", "8080")); if (StringUtils.isNotBlank(proxyUsername) && StringUtils.isNotBlank(proxyUsername)) { hc.setProxyCredentials(new UsernamePasswordCredentials(proxyUsername, proxyPassword)); } if (StringUtils.isNotBlank(proxyHost)) { hc.setProxyHost(new ProxyHost(proxyHost, proxyPort)); } else if (StringUtils.isNotBlank(System.getProperty("http.proxyhost")) && StringUtils.isNotBlank(System.getProperty("http.proxyport"))) { hc.setProxyHost(new ProxyHost(System.getProperty("http.proxyhost"), Integer.parseInt(System.getProperty("http.proxyport")))); } crawler.setHostConfig(hc); SSUriHelper helper = null; if (cmd.hasOption('f')) { final UriHelperFactory f = (UriHelperFactory) Class.forName(cmd.getOptionValue('f')).newInstance(); helper = f.create(crawler.getStartUri().getPath()); } else { helper = new SSUriHelper(crawler.getStartUri().getPath()); } final ThreadPoolExecutor readerPool = new RenderingThreadPool(threads); final MBeanServer platform = java.lang.management.ManagementFactory.getPlatformMBeanServer(); try { platform.registerMBean(readerPool, new ObjectName("com.fatwire.crawler:name=readerpool")); } catch (final Throwable x) { LogFactory.getLog(App.class).error(x.getMessage(), x); } crawler.setExecutor(readerPool); File path = null; if (cmd.hasOption('d')) { path = new File(cmd.getOptionValue("d")); } else { path = getOutputDir(); } if (path != null) { final SimpleDateFormat df = new SimpleDateFormat("yyyyMMdd_HHmm"); path = new File(path, df.format(new Date())); path.mkdirs(); } crawler.setReporters(createReporters(path, helper)); crawler.setUriHelper(helper); try { crawler.work(); } finally { readerPool.shutdown(); try { platform.unregisterMBean(new ObjectName("com.fatwire.crawler:name=readerpool")); } catch (final Throwable x) { LogFactory.getLog(App.class).error(x.getMessage(), x); } } } protected File getOutputDir() { final File outputDir = new File("./reports"); outputDir.mkdirs(); return outputDir; } protected List<Reporter> createReporters(final File outputDir, final SSUriHelper helper) { final List<Reporter> reporters = new ArrayList<Reporter>(); reporters.add(new PageletUriCollectingReporter(new FileReport(outputDir, "pagelets.tsv", '\t'))); reporters.add(new PageCollectingReporter(new File(outputDir, "pages"))); reporters.add( new OuterLinkCollectingReporter(new FileReport(outputDir, "browsable-links.tsv", '\t'), helper)); reporters.add(new InnerLinkReporter(new FileReport(outputDir, "inner-links.tsv", '\t'))); reporters.add(new PageletTimingsStatisticsReporter(new FileReport(outputDir, "pagelet-stats.tsv", '\t'))); reporters.add(new PageRenderTimeReporter(new FileReport(outputDir, "pagelet-timings.tsv", '\t'))); reporters.add(new PageCriteriaReporter(new FileReport(outputDir, "pagecriteria.tsv", '\t'))); reporters.add(new RootElementReporter(new FileReport(outputDir, "root-elements.tsv", '\t'))); reporters.add(new Non200ResponseCodeReporter(new FileReport(outputDir, "non-200-repsonse.tsv", '\t'))); reporters.add(new InnerPageletPerOuterReporter(new FileReport(outputDir, "inner-pagelets.tsv", '\t'))); reporters.add( new NumberOfInnerPageletsReporter(new FileReport(outputDir, "num-inner-pagelets.tsv", '\t'), 8)); reporters.add(new NestingReporter(new FileReport(outputDir, "nesting.tsv", '\t'), 8, 5)); reporters.add(new PageletOnlyReporter(new FileReport(outputDir, "pagelet-only.tsv", '\t'))); reporters.add(new NotCachedReporter(new FileReport(outputDir, "not-cached.tsv", '\t'))); reporters.add(new DefaultArgumentsAsPageCriteriaReporter( new FileReport(outputDir, "defaultArguments-as-pagecriteria.tsv", '\t'))); reporters.add(new SameContentPageletReporter(new FileReport(outputDir, "same-content-pagelet.tsv", '\t'))); reporters.add(new PageletReuseReporter(new FileReport(outputDir, "pagelet-reuse.tsv", '\t'), 5)); reporters.add(new SuspiciousContextParamReporter( new FileReport(outputDir, "suspicious-context-parameters.tsv", '\t'))); reporters.add(new SummaryReporter(new FileReport(outputDir, "summary.txt", '\t'), reporters)); /* * TODO - inner uncached pagelets, is inner uncached good? Too many */ return reporters; } }