Java tutorial
/* * Copyright 2011 Peter Karich * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a * copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations * under the License. */ package de.jetwick.snacktory; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; import java.net.MalformedURLException; import java.net.Proxy; import java.util.LinkedHashSet; import java.util.Set; import java.util.concurrent.atomic.AtomicInteger; import java.util.zip.GZIPInputStream; import java.util.zip.Inflater; import java.util.zip.InflaterInputStream; import org.apache.http.Header; import org.apache.http.HttpEntity; import org.apache.http.HttpStatus; import org.apache.http.client.config.CookieSpecs; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpHead; import org.apache.http.client.methods.HttpRequestBase; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Class to fetch articles. This class is thread safe. * * @author Peter Karich */ public class HtmlFetcher { static { SHelper.enableCookieMgmt(); SHelper.enableUserAgentOverwrite(); SHelper.enableAnySSL(); } private static final Logger logger = LoggerFactory.getLogger(HtmlFetcher.class); public static void main(String[] args) throws Exception { BufferedReader reader = new BufferedReader(new FileReader("urls.txt")); String line = null; Set<String> existing = new LinkedHashSet<String>(); while ((line = reader.readLine()) != null) { int index1 = line.indexOf("\""); int index2 = line.indexOf("\"", index1 + 1); String url = line.substring(index1 + 1, index2); String domainStr = SHelper.extractDomain(url, true); String counterStr = ""; // TODO more similarities if (existing.contains(domainStr)) counterStr = "2"; else existing.add(domainStr); String html = new HtmlFetcher().fetchAsString(url, 20000); String outFile = domainStr + counterStr + ".html"; BufferedWriter writer = new BufferedWriter(new FileWriter(outFile)); writer.write(html); writer.close(); } reader.close(); } private String referrer = "https://app.appgree.com"; private String userAgent = "Mozilla/5.0 (compatible; Snacktory; " + referrer + ")"; private String cacheControl = "max-age=0"; private String language = "en-us"; private String accept = "application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"; private String charset = "UTF-8"; private SCache cache; private Proxy proxy = null; private AtomicInteger cacheCounter = new AtomicInteger(0); private int maxTextLength = -1; private ArticleTextExtractor extractor = new ArticleTextExtractor(); @SuppressWarnings("serial") private Set<String> furtherResolveNecessary = new LinkedHashSet<String>() { { add("bit.ly"); add("cli.gs"); add("deck.ly"); add("fb.me"); add("feedproxy.google.com"); add("flic.kr"); add("fur.ly"); add("goo.gl"); add("is.gd"); add("ink.co"); add("j.mp"); add("lnkd.in"); add("on.fb.me"); add("ow.ly"); add("plurl.us"); add("sns.mx"); add("snurl.com"); add("su.pr"); add("t.co"); add("tcrn.ch"); add("tl.gd"); add("tiny.cc"); add("tinyurl.com"); add("tmi.me"); add("tr.im"); add("twurl.nl"); add("kcy.me"); } }; public HtmlFetcher() { } public void setExtractor(ArticleTextExtractor extractor) { this.extractor = extractor; } public ArticleTextExtractor getExtractor() { return extractor; } public HtmlFetcher setCache(SCache cache) { this.cache = cache; return this; } public SCache getCache() { return cache; } public int getCacheCounter() { return cacheCounter.get(); } public HtmlFetcher clearCacheCounter() { cacheCounter.set(0); return this; } public HtmlFetcher setMaxTextLength(int maxTextLength) { this.maxTextLength = maxTextLength; return this; } public int getMaxTextLength() { return maxTextLength; } public void setAccept(String accept) { this.accept = accept; } public void setCharset(String charset) { this.charset = charset; } public void setCacheControl(String cacheControl) { this.cacheControl = cacheControl; } public String getLanguage() { return language; } public void setLanguage(String language) { this.language = language; } public String getReferrer() { return referrer; } public HtmlFetcher setReferrer(String referrer) { this.referrer = referrer; return this; } public String getUserAgent() { return userAgent; } public void setUserAgent(String userAgent) { this.userAgent = userAgent; } public String getAccept() { return accept; } public String getCacheControl() { return cacheControl; } public String getCharset() { return charset; } public void setProxy(Proxy proxy) { this.proxy = proxy; } public Proxy getProxy() { return (proxy != null ? proxy : Proxy.NO_PROXY); } public boolean isProxySet() { return getProxy() != null; } public JResult fetchAndExtract(String url, int timeout, boolean resolve) throws Exception { JResult result = new JResult(); try { String originalUrl = url; url = SHelper.removeHashbang(url); String gUrl = SHelper.getUrlFromUglyGoogleRedirect(url); if (gUrl != null) url = gUrl; else { gUrl = SHelper.getUrlFromUglyFacebookRedirect(url); if (gUrl != null) url = gUrl; } if (resolve) { // check if we can avoid resolving the URL (which hits the website!) JResult res = getFromCache(url, originalUrl); if (res != null) return res; String resUrl = getResolvedUrl(url, timeout); if (resUrl.isEmpty()) { if (logger.isDebugEnabled()) logger.warn("resolved url is empty. Url is: " + url); result = new JResult(); if (cache != null) cache.put(url, result); return result.setUrl(url); } // if resolved url is longer then use it! if (resUrl != null && resUrl.trim().length() > url.length()) { // this is necessary e.g. for some homebaken url resolvers which return // the resolved url relative to url! url = SHelper.useDomainOfFirstArg4Second(url, resUrl); } } // check if we have the (resolved) URL in cache JResult res = getFromCache(url, originalUrl); if (res != null) return res; result = new JResult(); // or should we use? <link rel="canonical" href="http://www.N24.de/news/newsitem_6797232.html"/> result.setUrl(url); result.setOriginalUrl(originalUrl); result.setDate(SHelper.estimateDate(url)); // Immediately put the url into the cache as extracting content takes time. if (cache != null) { cache.put(originalUrl, result); cache.put(url, result); } String lowerUrl = url.toLowerCase(); if (SHelper.isDoc(lowerUrl) || SHelper.isApp(lowerUrl) || SHelper.isPackage(lowerUrl)) { // skip } else if (SHelper.isVideo(lowerUrl) || SHelper.isAudio(lowerUrl)) { result.setVideoUrl(url); } else if (SHelper.isImage(lowerUrl)) { result.setImageUrl(url); } else { extractor.extractContent(url, result, fetchAsString(url, timeout)); if (result.getFaviconUrl().isEmpty()) result.setFaviconUrl(SHelper.getDefaultFavicon(url)); // some links are relative to root and do not include the domain of the url :( result.setFaviconUrl(fixUrl(url, result.getFaviconUrl())); result.setImageUrl(fixUrl(url, result.getImageUrl())); result.setVideoUrl(fixUrl(url, result.getVideoUrl())); result.setRssUrl(fixUrl(url, result.getRssUrl())); } result.setText(lessText(result.getText())); synchronized (result) { result.notifyAll(); } return result; } catch (Exception e) { return result; } } public String lessText(String text) { if (text == null) return ""; if (maxTextLength >= 0 && text.length() > maxTextLength) return text.substring(0, maxTextLength); return text; } private static String fixUrl(String url, String urlOrPath) { return SHelper.useDomainOfFirstArg4Second(url, urlOrPath); } public String fetchAsString(String urlAsString, int timeout) throws MalformedURLException, IOException { return fetchAsString(urlAsString, timeout, true); } public String fetchAsString(String urlAsString, int timeout, boolean includeSomeGooseOptions) throws MalformedURLException, IOException { urlAsString = urlAsString.replace("https", "http"); CloseableHttpResponse response = createUrlConnection(urlAsString, timeout, includeSomeGooseOptions, false); if (response.getStatusLine().getStatusCode() > 399) { throw new MalformedURLException(response.getStatusLine().toString()); } Header header = response.getFirstHeader("Content-Type"); String encoding = null; if (header == null) { encoding = "utf-8"; } else { encoding = header.getValue(); if (encoding == null || !encoding.startsWith("text")) { throw new MalformedURLException("Not an HTML content!"); } } String res = null; try { final HttpEntity body = response.getEntity(); InputStream is; if (encoding != null && encoding.equalsIgnoreCase("gzip")) { is = new GZIPInputStream(body.getContent()); } else if (encoding != null && encoding.equalsIgnoreCase("deflate")) { is = new InflaterInputStream(body.getContent(), new Inflater(true)); } else { is = body.getContent(); } String enc = Converter.extractEncoding(encoding); res = createConverter(urlAsString).streamToString(is, enc); EntityUtils.consume(body); if (logger.isDebugEnabled()) logger.debug(res.length() + " FetchAsString:" + urlAsString); } finally { response.close(); } return res; } public Converter createConverter(String url) { return new Converter(url); } /** * On some devices we have to hack: http://developers.sun.com/mobility/reference/techart/design_guidelines/http_redirection.html * * @param timeout Sets a specified timeout value, in milliseconds * @return the resolved url if any. Or null if it couldn't resolve the url (within the specified time) or the same url if response code is OK */ public String getResolvedUrl(String urlAsString, int timeout) { int responseCode = -1; String newUrl = null; try { urlAsString = urlAsString.replace("https", "http"); CloseableHttpResponse response = createUrlConnection(urlAsString, timeout, true, true); responseCode = response.getStatusLine().getStatusCode(); if (responseCode == HttpStatus.SC_OK) return urlAsString; Header location = response.getLastHeader("Location"); if (responseCode / 100 == 3 && location != null) { newUrl = location.getValue().replaceAll(" ", "+"); // some services use (none-standard) utf8 in their location header if (urlAsString.startsWith("http://bit.ly") || urlAsString.startsWith("http://is.gd")) newUrl = encodeUriFromHeader(newUrl); // fix problems if shortened twice. as it is often the case after twitters' t.co bullshit if (furtherResolveNecessary.contains(SHelper.extractDomain(newUrl, true))) newUrl = getResolvedUrl(newUrl, timeout); return newUrl; } else return urlAsString; } catch (Exception ex) { logger.warn("getResolvedUrl:" + urlAsString + " Error:" + ex.getMessage(), ex); return ""; } finally { if (logger.isDebugEnabled()) logger.debug(responseCode + " url:" + urlAsString + " resolved:" + newUrl); } } /** * Takes a URI that was decoded as ISO-8859-1 and applies percent-encoding to non-ASCII characters. Workaround for broken origin servers that send * UTF-8 in the Location: header. */ static String encodeUriFromHeader(String badLocation) { StringBuilder sb = new StringBuilder(); for (char ch : badLocation.toCharArray()) { if (ch < (char) 128) { sb.append(ch); } else { // this is ONLY valid if the uri was decoded using ISO-8859-1 sb.append(String.format("%%%02X", (int) ch)); } } return sb.toString(); } protected CloseableHttpResponse createUrlConnection(String urlAsStr, int timeout, boolean includeSomeGooseOptions, boolean isHead) throws MalformedURLException, IOException { CloseableHttpClient httpclient = HttpClients.createDefault(); HttpRequestBase request = null; if (isHead) { request = new HttpHead(urlAsStr); } else { request = new HttpGet(urlAsStr); } RequestConfig requestConfig = RequestConfig.custom().setConnectionRequestTimeout(timeout) .setConnectTimeout(timeout).setSocketTimeout(timeout).setCookieSpec(CookieSpecs.STANDARD).build(); request.setHeader("User-Agent", userAgent); request.setHeader("Accept", accept); if (includeSomeGooseOptions) { request.setHeader("Accept-Language", language); request.setHeader("content-charset", charset); request.setHeader("Referer", referrer); // avoid the cache for testing purposes only? request.setHeader("Cache-Control", cacheControl); } // suggest respond to be gzipped or deflated (which is just another compression) // http://stackoverflow.com/q/3932117 request.setHeader("Accept-Encoding", "gzip, deflate"); request.setConfig(requestConfig); return httpclient.execute(request); } private JResult getFromCache(String url, String originalUrl) throws Exception { if (cache != null) { JResult res = cache.get(url); if (res != null) { // e.g. the cache returned a shortened url as original url now we want to store the // current original url! Also it can be that the cache response to url but the JResult // does not contain it so overwrite it: res.setUrl(url); res.setOriginalUrl(originalUrl); cacheCounter.addAndGet(1); return res; } } return null; } }