Java tutorial
/* * Copyright 2015 Crosstree Labs. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.crosstreelabs.cognitio.gumshoe.format; import com.crosstreelabs.cognitio.api.extension.FormatHandler; import com.crosstreelabs.cognitio.api.resource.Visit; import de.l3s.boilerpipe.BoilerpipeProcessingException; import de.l3s.boilerpipe.extractors.ArticleExtractor; import io.mola.galimatias.GalimatiasParseException; import io.mola.galimatias.URL; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.net.URISyntaxException; import java.util.logging.Level; import org.apache.commons.io.IOUtils; import org.apache.commons.io.output.ByteArrayOutputStream; import org.apache.commons.lang3.StringUtils; import org.apache.http.client.utils.URIBuilder; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class HtmlFormatHandler implements FormatHandler { private static final Logger LOGGER = LoggerFactory.getLogger(HtmlFormatHandler.class); @Override public boolean handles(final Visit visit) { if ("text/html".equals(visit.contentType)) { return true; } if (visit.contentStream == null) { return false; } try { if (!(visit.contentStream instanceof ByteArrayInputStream)) { ByteArrayOutputStream baos = new ByteArrayOutputStream(); IOUtils.copy(visit.contentStream, baos); visit.contentStream = new ByteArrayInputStream(baos.toByteArray()); } byte[] buf = new byte[250]; IOUtils.read(visit.contentStream, buf, 0, 250); visit.contentStream.reset(); String chunk = new String(buf); return chunk.toLowerCase().contains("<html"); } catch (IOException ex) { } return false; } @Override public void handle(final Visit visit) { if (!handles(visit)) { throw new UnsupportedOperationException("Cannot handle resource"); } processLinks(visit); processContent(visit); } @Override public void processLinks(final Visit visit) { try { String charset = StringUtils.defaultIfBlank(visit.contentCharset, "UTF-8"); Document doc = Jsoup.parse(visit.contentStream, charset, visit.result.location); Elements anchors = doc.getElementsByTag("a"); for (Element e : anchors) { String url = stripURLFragmentIdentifier(e.attr("abs:href")); String uri = stripURLFragmentIdentifier(e.attr("href").toLowerCase()); if (uri.isEmpty() || url.isEmpty() || uri.contains("javascript:") || uri.contains("mailto:") || uri.contains("@")) { continue; } visit.discoveredLinks.add(URL.parse(url).toString()); // TODO Need to add the link text as the title } visit.contentStream.reset(); } catch (GalimatiasParseException | IOException ex) { throw new RuntimeException(ex); } } @Override public void processContent(final Visit visit) { try { visit.result.description = ArticleExtractor.getInstance() .getText(new InputStreamReader(visit.contentStream)); } catch (BoilerpipeProcessingException ex) { throw new RuntimeException(ex); } } protected static String stripURLFragmentIdentifier(final String url) { if (!url.contains("#")) { return url; } int pos = url.indexOf("#"); int pos2 = url.indexOf("#!"); if (pos == pos2) { return url; } return url.substring(0, pos); } }