Java tutorial
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.any23;

import junit.framework.Assert;
import org.apache.any23.configuration.DefaultConfiguration;
import org.apache.any23.configuration.ModifiableConfiguration;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.extractor.ExtractionParameters;
import org.apache.any23.extractor.Extractor;
import org.apache.any23.extractor.microdata.MicrodataExtractor;
import org.apache.any23.filter.IgnoreAccidentalRDFa;
import org.apache.any23.filter.IgnoreTitlesOfEmptyDocuments;
import org.apache.any23.http.DefaultHTTPClient;
import org.apache.any23.http.HTTPClient;
import org.apache.any23.source.DocumentSource;
import org.apache.any23.source.HTTPDocumentSource;
import org.apache.any23.source.StringDocumentSource;
import org.apache.any23.util.FileUtils;
import org.apache.any23.util.StreamUtils;
import org.apache.any23.util.StringUtils;
import org.apache.any23.vocab.DCTerms;
import org.apache.any23.writer.CompositeTripleHandler;
import org.apache.any23.writer.CountingTripleHandler;
import org.apache.any23.writer.NTriplesWriter;
import org.apache.any23.writer.RDFXMLWriter;
import org.apache.any23.writer.ReportingTripleHandler;
import org.apache.any23.writer.RepositoryWriter;
import org.apache.any23.writer.TripleHandler;
import org.apache.any23.writer.TripleHandlerException;
import org.apache.commons.io.IOUtils;
import org.junit.Ignore;
import org.junit.Test;
import org.openrdf.model.Statement;
import org.openrdf.repository.RepositoryConnection;
import org.openrdf.repository.RepositoryException;
import org.openrdf.repository.RepositoryResult;
import org.openrdf.repository.sail.SailRepository;
import org.openrdf.rio.RDFParseException;
import org.openrdf.sail.Sail;
import org.openrdf.sail.SailException;
import org.openrdf.sail.memory.MemoryStore;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.List;

import static org.apache.any23.extractor.ExtractionParameters.ValidationMode;

/**
 * Test case for {@link Any23} facade.
 *
 * @author Davide Palmisano ( dpalmisano@gmail.com )
 * @author Michele Mostarda ( michele.mostarda@gmail.com )
 */
@SuppressWarnings("unchecked")
public class Any23Test extends Any23OnlineTestBase {

    private static final DCTerms vDCTERMS = DCTerms.getInstance();

    private static final String PAGE_URL = "http://bob.com";

    private static final Logger logger = LoggerFactory.getLogger(Any23Test.class);

    @Test
    public void testTTLDetection() throws Exception {
        assertDetection("<a> <b> <c> .", "rdf-turtle");
    }

    @Test
    public void testN3Detection1() throws Exception {
        assertDetection("<Bob><brothers>(<Jim><Mark>).", "rdf-turtle");
    }

    @Test
    public void testN3Detection2() throws Exception {
        assertDetection(
                "<http://example.org/path> <http://foo.com> <http://example.org/Document/foo#> .",
                "rdf-nt");
    }

    @Test
    public void testHTMLBruteForceDetection() throws Exception {
        // No parser names are given, so assertDetection() builds Any23 with its default extractors.
        assertDetection("<html><body><div class=\"vcard fn\">Joe</div></body></html>");
    }

    /**
     * This tests the behavior of <i>Any23</i> to execute the extraction explicitly specifying the charset
     * encoding of the input.
     *
     * @throws Exception if the extraction or the repository access fails.
     */
    @Test
    public void testExplicitEncoding() throws Exception {
        assertEncodingDetection("UTF-8", "/html/encoding-test.html", "Knud M\u00F6ller");
    }

    /**
     * This tests the behavior of <i>Any23</i> to perform the extraction without passing it any charset encoding.
     * The encoding is therefore guessed using {@link org.apache.any23.encoding.TikaEncodingDetector} class.
     *
     * @throws Exception if the extraction or the repository access fails.
     */
    @Test
    public void testImplicitEncoding() throws Exception {
        assertEncodingDetection(
                null, // The encoding will be auto detected.
                "/html/encoding-test.html",
                "Knud M\u00F6ller");
    }

    @Test
    public void testRDFXMLDetectionAndExtraction() throws Exception {
        String rdfXML = "<?xml version='1.0'?> "
                + "<rdf:RDF xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#' "
                + "xmlns:dc='http://purl.org/dc/elements/1.1/'>"
                + "<rdf:Description rdf:about='http://www.example.com'>"
                + "<dc:title>x</dc:title>"
                + "</rdf:Description>"
                + "</rdf:RDF>";
        assertDetectionAndExtraction(rdfXML);
    }

    @Test
    public void testNTriplesDetectionAndExtraction() throws Exception {
        String n3 = "<http://www.example.com> <http://purl.org/dc/elements/1.1/title> \"n3 . appo\" .";
        assertDetectionAndExtraction(n3);
    }

    @Test
    public void testNturtleDetectionAndExtraction() throws Exception {
        String nTurtle = "@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .\n"
                + "@prefix dc: <http://purl.org/dc/elements/1.1/> .\n"
                + "@prefix ex: <http://example.org/stuff/1.0/> .\n"
                + "\n"
                + "<http://www.w3.org/TR/rdf-syntax-grammar>\n"
                + " dc:title \"RDF/XML Syntax Specification (Revised)\" ;\n"
                + " ex:editor [\n"
                + " ex:fullname \"Dave Beckett\";\n"
                + " ex:homePage <http://purl.org/net/dajobe/>\n"
                + " ] .";
        assertDetectionAndExtraction(nTurtle);
    }

    /**
     * Tests out the first code snippet used in <i>Developer Manual</i>.
     * The numbered markers are kept so the statements can be matched against the manual.
     *
     * @throws Exception if the extraction or the handler close fails.
     */
    @Test
    public void testDemoCodeSnippet1() throws Exception {
        /*1*/ Any23 runner = new Any23();
        /*2*/ final String content = "@prefix foo: <http://example.org/ns#> . "
                + "@prefix : <http://other.example.org/ns#> ."
                + "foo:bar foo: : . "
                + ":bar : foo:bar . ";
        // The second argument of StringDocumentSource() must be a valid URI.
        /*3*/ DocumentSource source = new StringDocumentSource(content, "http://host.com/service");
        /*4*/ ByteArrayOutputStream out = new ByteArrayOutputStream();
        /*5*/ TripleHandler handler = new NTriplesWriter(out);
        try {
            /*6*/ runner.extract(source, handler);
        } finally {
            /*7*/ handler.close();
        }
        /*8*/ String nt = out.toString("UTF-8");

        /*
            <http://example.org/ns#bar> <http://example.org/ns#> <http://other.example.org/ns#> .
            <http://other.example.org/ns#bar> <http://other.example.org/ns#> <http://example.org/ns#bar> .
         */
        logger.debug("nt: {}", nt);
        Assert.assertTrue(nt.length() > 0);
    }

    /**
     * This test checks the extraction behavior when the library is used programmatically.
     * This test is related to the issue #45, to verify the different behaviors between Maven and Ant.
     * The behavior was related to a 2nd-level dependency introduced by Maven.
     *
     * @throws org.apache.any23.extractor.ExtractionException if the extraction fails.
     * @throws IOException if the test resource cannot be read.
     * @throws URISyntaxException declared by the original signature.
     */
    @Test
    public void testProgrammaticExtraction() throws ExtractionException, IOException, URISyntaxException {
        Any23 any23 = new Any23();
        any23.setHTTPUserAgent("Any23-Servlet");
        any23.setHTTPClient(new DefaultHTTPClient() {
            @Override
            protected int getConnectionTimeout() {
                return 5000;
            }

            @Override
            protected int getSoTimeout() {
                return 2000;
            }
        });
        ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
        TripleHandler handler = new RDFXMLWriter(byteArrayOutputStream);
        TripleHandler rdfWriter = new IgnoreAccidentalRDFa(handler);
        ReportingTripleHandler reporting = new ReportingTripleHandler(rdfWriter);

        DocumentSource source = getDocumentSourceFromResource(
                "/html/rdfa/ansa_2010-02-26_12645863.html",
                "http://host.com/service");

        Assert.assertTrue(any23.extract(source, reporting).hasMatchingExtractors());
        try {
            handler.close();
        } catch (TripleHandlerException e) {
            Assert.fail(e.getMessage());
        }

        // Decode with an explicit charset rather than the platform default.
        final String bufferContent = byteArrayOutputStream.toString("UTF-8");
        logger.debug(bufferContent);
        // assertEquals compares values; assertSame would compare boxed Integer references.
        Assert.assertEquals("Unexpected number of triples.", 60, StringUtils.countNL(bufferContent));
    }

    /**
     * This test checks if a URL that is supposed to be GZIPPED is correctly opened and parsed with
     * the {@link Any23} facade.
     *
     * @throws IOException if the remote document cannot be fetched.
     * @throws URISyntaxException if the document URL is not valid.
     * @throws ExtractionException if the extraction fails.
     * @throws TripleHandlerException if the output handler cannot be closed.
     */
    @Ignore("ANY23-140 - Revise Any23 tests to remove fetching of web content")
    @Test
    public void testGZippedContent()
            throws IOException, URISyntaxException, ExtractionException, TripleHandlerException {
        assumeOnlineAllowed();
        Any23 runner = new Any23();
        runner.setHTTPUserAgent("test-user-agent");
        HTTPClient httpClient = runner.getHTTPClient();
        DocumentSource source = new HTTPDocumentSource(
                httpClient,
                "http://products.semweb.bestbuy.com/y/products/7590289/");
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        TripleHandler handler = new NTriplesWriter(out);
        try {
            runner.extract(source, handler);
        } finally {
            // Close the writer before reading the buffer.
            handler.close();
        }
        String n3 = out.toString("UTF-8");
        logger.debug("N3 {}", n3);
        Assert.assertTrue(n3.length() > 0);
    }

    /**
     * Runs the same document through the extraction twice, first with {@link ValidationMode#None}
     * and then with {@link ValidationMode#ValidateAndFix}, and compares the number of produced triples.
     *
     * @throws IOException if the test resource cannot be read.
     * @throws ExtractionException if the extraction fails.
     * @throws TripleHandlerException if an output handler cannot be closed.
     */
    @Test
    public void testExtractionParameters() throws IOException, ExtractionException, TripleHandlerException {
        final int EXPECTED_TRIPLES = 6;
        Any23 runner = new Any23();
        DocumentSource source = getDocumentSourceFromResource(
                "/org/apache/any23/validator/missing-og-namespace.html",
                "http://www.test.com");

        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        CountingTripleHandler cth1 = new CountingTripleHandler();
        NTriplesWriter ctw1 = new NTriplesWriter(baos);
        CompositeTripleHandler compositeTH1 = new CompositeTripleHandler();
        compositeTH1.addChild(cth1);
        compositeTH1.addChild(ctw1);
        try {
            runner.extract(
                    new ExtractionParameters(DefaultConfiguration.singleton(), ValidationMode.None),
                    source,
                    compositeTH1);
        } finally {
            compositeTH1.close();
        }
        logger.info(baos.toString("UTF-8"));
        Assert.assertEquals("Unexpected number of triples.", EXPECTED_TRIPLES, cth1.getCount());

        baos.reset();
        CountingTripleHandler cth2 = new CountingTripleHandler();
        NTriplesWriter ctw2 = new NTriplesWriter(baos);
        CompositeTripleHandler compositeTH2 = new CompositeTripleHandler();
        compositeTH2.addChild(cth2);
        compositeTH2.addChild(ctw2);
        try {
            runner.extract(
                    new ExtractionParameters(DefaultConfiguration.singleton(), ValidationMode.ValidateAndFix),
                    source,
                    compositeTH2);
        } finally {
            // The second composite handler is closed like the first one.
            compositeTH2.close();
        }
        logger.debug(baos.toString("UTF-8"));
        Assert.assertEquals("Unexpected number of triples.", EXPECTED_TRIPLES + 5, cth2.getCount());
    }

    /**
     * Runs the same nested-microformats document through the extraction with the third
     * {@link ExtractionParameters} constructor argument set to <code>true</code> and then to
     * <code>false</code>, and compares the number of produced triples.
     *
     * @throws IOException if the test resource cannot be read.
     * @throws ExtractionException if the extraction fails.
     * @throws TripleHandlerException if an output handler cannot be closed.
     */
    @Test
    public void testExtractionParametersWithNestingDisabled()
            throws IOException, ExtractionException, TripleHandlerException {
        final int EXPECTED_TRIPLES = 19;
        Any23 runner = new Any23();
        DocumentSource source = getDocumentSourceFromResource(
                "/microformats/nested-microformats-a1.html",
                "http://www.test.com");

        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        CountingTripleHandler cth1 = new CountingTripleHandler();
        RDFXMLWriter ctw1 = new RDFXMLWriter(baos);
        CompositeTripleHandler compositeTH1 = new CompositeTripleHandler();
        compositeTH1.addChild(cth1);
        compositeTH1.addChild(ctw1);
        try {
            runner.extract(
                    new ExtractionParameters(DefaultConfiguration.singleton(), ValidationMode.None, true),
                    source,
                    compositeTH1);
        } finally {
            compositeTH1.close();
        }
        logger.debug("Out1: {}", baos.toString("UTF-8"));
        Assert.assertEquals("Unexpected number of triples.", EXPECTED_TRIPLES + 3, cth1.getCount());

        baos.reset();
        CountingTripleHandler cth2 = new CountingTripleHandler();
        NTriplesWriter ctw2 = new NTriplesWriter(baos);
        CompositeTripleHandler compositeTH2 = new CompositeTripleHandler();
        compositeTH2.addChild(cth2);
        compositeTH2.addChild(ctw2);
        try {
            runner.extract(
                    new ExtractionParameters(
                            DefaultConfiguration.singleton(),
                            ValidationMode.ValidateAndFix,
                            false),
                    source,
                    compositeTH2);
        } finally {
            compositeTH2.close();
        }
        logger.debug("Out2: {}", baos.toString("UTF-8"));
        Assert.assertEquals("Unexpected number of triples.", EXPECTED_TRIPLES, cth2.getCount());
    }

    /**
     * Verifies that an {@link ExtractionException} raised while extracting the given Turtle resource
     * carries an {@link RDFParseException} as its cause.
     *
     * @throws IOException if the test resource cannot be read.
     */
    @Test
    public void testExceptionPropagation() throws IOException {
        Any23 any23 = new Any23();
        DocumentSource source = getDocumentSourceFromResource(
                "/application/turtle/geolinkeddata.ttl",
                "http://www.test.com");
        CountingTripleHandler cth1 = new CountingTripleHandler();
        try {
            any23.extract(source, cth1);
            // NOTE(review): the test also passes when no exception is thrown; confirm whether the
            // resource is expected to fail parsing before adding an Assert.fail() here.
        } catch (ExtractionException e) {
            Assert.assertTrue(e.getCause() instanceof RDFParseException);
        }
    }

    /**
     * Test correct management of general <i>XML</i> content.
     *
     * @throws IOException if the test resource cannot be read.
     * @throws ExtractionException if the extraction fails.
     */
    @Test
    public void testXMLMimeTypeManagement() throws IOException, ExtractionException {
        final String documentURI = "http://www.test.com/resource.xml";
        final String contentType = "application/xml";
        final String in = StreamUtils.asString(this.getClass().getResourceAsStream("any23-xml-mimetype.xml"));
        final DocumentSource doc = new StringDocumentSource(in, documentURI, contentType);
        final Any23 any23 = new Any23();
        final CountingTripleHandler cth = new CountingTripleHandler(false);
        final ReportingTripleHandler rth = new ReportingTripleHandler(cth);
        final ExtractionReport report = any23.extract(doc, rth);
        Assert.assertFalse(report.hasMatchingExtractors());
        Assert.assertEquals(0, cth.getCount());
    }

    /**
     * Test correct management of general <i>XML</i> content from <i>URL</i> source.
     *
     * @throws IOException if the remote document cannot be fetched.
     * @throws ExtractionException if the extraction fails.
     */
    @Test
    public void testXMLMimeTypeManagementViaURL() throws IOException, ExtractionException {
        assumeOnlineAllowed();
        final Any23 any23 = new Any23();
        any23.setHTTPUserAgent("test-user-agent");
        final CountingTripleHandler cth = new CountingTripleHandler(false);
        final ReportingTripleHandler rth = new ReportingTripleHandler(cth);
        final ExtractionReport report = any23.extract("http://www.nativeremedies.com/XML/combos.xml", rth);
        Assert.assertFalse(report.hasMatchingExtractors());
        Assert.assertEquals(0, cth.getCount());
    }

    @Test
    public void testMicrodataSupport() throws Exception {
        final String htmlWithMicrodata = IOUtils.toString(
                this.getClass().getResourceAsStream("/microdata/microdata-basic.html"));
        assertExtractorActivation(htmlWithMicrodata, MicrodataExtractor.class);
    }

    /**
     * Regression test for issue 186: the extraction of the first sample document must complete
     * without raising an error. The produced output is only logged.
     *
     * @throws IOException if the test resource cannot be read.
     * @throws ExtractionException if the extraction fails.
     * @throws TripleHandlerException if the output handler cannot be closed.
     */
    @Test
    public void testAbstractMethodErrorIssue186_1()
            throws IOException, ExtractionException, TripleHandlerException {
        final Any23 runner = new Any23();
        final String content = FileUtils.readResourceContent("/html/rdfa/rdfa-issue186-1.xhtml");
        final DocumentSource source = new StringDocumentSource(content, "http://base.com");
        final ByteArrayOutputStream out = new ByteArrayOutputStream();
        final TripleHandler handler = new NTriplesWriter(out);
        try {
            runner.extract(source, handler);
        } finally {
            handler.close();
        }
        final String n3 = out.toString("UTF-8");
        logger.debug(n3);
    }

    /**
     * Regression test for issue 186: the extraction of the second sample document must complete
     * without raising an error. The produced output is only logged.
     *
     * @throws IOException if the test resource cannot be read.
     * @throws ExtractionException if the extraction fails.
     * @throws TripleHandlerException if the output handler cannot be closed.
     */
    @Test
    public void testAbstractMethodErrorIssue186_2()
            throws IOException, ExtractionException, TripleHandlerException {
        final Any23 runner = new Any23();
        final String content = FileUtils.readResourceContent("/html/rdfa/rdfa-issue186-2.xhtml");
        final DocumentSource source = new StringDocumentSource(content, "http://richard.cyganiak.de/");
        final ByteArrayOutputStream out = new ByteArrayOutputStream();
        final TripleHandler handler = new NTriplesWriter(out);
        try {
            runner.extract(source, handler);
        } finally {
            handler.close();
        }
        final String n3 = out.toString("UTF-8");
        logger.debug(n3);
    }

    /**
     * Regression test for issue 183: with the <code>any23.extraction.metadata.timesize</code>
     * property set to <code>off</code>, the output must not mention the Sindice date and size
     * vocabulary terms.
     *
     * @throws Exception if the extraction or the handler close fails.
     */
    @Test
    public void testModifiableConfiguration_issue183() throws Exception {
        final ModifiableConfiguration modifiableConf = DefaultConfiguration.copy();
        modifiableConf.setProperty("any23.extraction.metadata.timesize", "off");
        final Any23 any23 = new Any23(modifiableConf);

        final String content = FileUtils.readResourceContent("/rdf/rdf-issue183.ttl");
        final DocumentSource source = new StringDocumentSource(content, "http://base.com");
        final ByteArrayOutputStream out = new ByteArrayOutputStream();
        final TripleHandler handler = new NTriplesWriter(out);
        try {
            any23.extract(source, handler);
        } finally {
            handler.close();
        }
        final String n3 = out.toString("UTF-8");
        logger.debug(n3);
        Assert.assertFalse(
                "Should not contain triple with http://vocab.sindice.net/date",
                n3.contains("http://vocab.sindice.net/date"));
        Assert.assertFalse(
                "Should not contain triple with http://vocab.sindice.net/size",
                n3.contains("http://vocab.sindice.net/size"));
    }

    /**
     * Performs detection and extraction on the given input string
     * and return the {@link ExtractionReport}.
     *
     * @param in input string.
     * @return the report produced by the extraction.
     * @throws Exception if the extraction or the handler close fails.
     */
    private ExtractionReport detectAndExtract(String in) throws Exception {
        Any23 any23 = new Any23();
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        ReportingTripleHandler outputHandler = new ReportingTripleHandler(
                new IgnoreAccidentalRDFa(
                        new IgnoreTitlesOfEmptyDocuments(
                                new NTriplesWriter(out))));
        try {
            return any23.extract(in, "http://host.com/path", outputHandler);
        } finally {
            // Release the handler chain even when the extraction fails.
            outputHandler.close();
        }
    }

    /**
     * Asserts that at least one {@link Extractor} has been activated for the given input data.
     *
     * @param in input data as string.
     * @throws Exception if the extraction fails.
     */
    private void assertDetectionAndExtraction(String in) throws Exception {
        final ExtractionReport extractionReport = detectAndExtract(in);
        Assert.assertTrue(
                "Detection and extraction failed, no matching extractors.",
                extractionReport.hasMatchingExtractors());
    }

    /**
     * Assert the correct activation of the given list of {@link Extractor}s for the given input string.
     *
     * @param in input data as string.
     * @param expectedExtractors extractor classes that must appear among the matching extractors.
     * @throws Exception if the extraction fails.
     */
    private void assertExtractorActivation(String in, Class<? extends Extractor>... expectedExtractors)
            throws Exception {
        final ExtractionReport extractionReport = detectAndExtract(in);
        for (Class<? extends Extractor> expectedExtractorClass : expectedExtractors) {
            Assert.assertTrue(
                    String.format(
                            "Detection and extraction failed, expected extractor [%s] not found.",
                            expectedExtractorClass),
                    containsClass(extractionReport.getMatchingExtractors(), expectedExtractorClass));
        }
    }

    /**
     * Asserts the correct encoding detection for a specified data.
     *
     * @param encoding the expected specified encoding, if <code>null</code> will be auto detected.
     * @param input classpath location of the document to extract.
     * @param expectedContent text that every extracted <code>dcterms:title</code> value must contain.
     * @throws Exception if the extraction or the repository access fails.
     */
    private void assertEncodingDetection(String encoding, String input, String expectedContent)
            throws Exception {
        final DocumentSource fileDocumentSource = getDocumentSourceFromResource(input);
        final Any23 any23 = new Any23();
        final Sail store = new MemoryStore();
        store.initialize();
        final RepositoryConnection conn = new SailRepository(store).getConnection();
        try {
            final RepositoryWriter repositoryWriter = new RepositoryWriter(conn);
            try {
                Assert.assertTrue(
                        any23.extract(fileDocumentSource, repositoryWriter, encoding).hasMatchingExtractors());

                final RepositoryResult<Statement> statements =
                        conn.getStatements(null, vDCTERMS.title, null, false);
                try {
                    while (statements.hasNext()) {
                        final Statement statement = statements.next();
                        printStatement(statement);
                        Assert.assertTrue(statement.getObject().stringValue().contains(expectedContent));
                    }
                } finally {
                    statements.close();
                }
            } finally {
                // Close in reverse order of acquisition: the writer first, then its connection.
                repositoryWriter.close();
            }
        } finally {
            conn.close();
        }
    }

    /**
     * Will try to detect the <i>content</i> trying sequentially with all
     * specified parser.
     *
     * @param content the document content to extract.
     * @param parsers names of the extractors to use; when empty, <code>null</code> is passed
     *                to the {@link Any23} constructor.
     * @throws Exception if the extraction or the handler close fails.
     */
    private void assertDetection(String content, String... parsers) throws Exception {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        Any23 runner = new Any23(parsers.length == 0 ? null : parsers);
        if (parsers.length != 0) {
            runner.setMIMETypeDetector(null); // Use all the provided extractors.
        }
        final NTriplesWriter tripleHandler = new NTriplesWriter(out);
        try {
            runner.extract(new StringDocumentSource(content, PAGE_URL), tripleHandler);
        } finally {
            tripleHandler.close();
        }
        String result = out.toString("us-ascii");
        Assert.assertNotNull(result);
        Assert.assertTrue(result.length() > 10);
    }

    /**
     * Logs the subject, predicate and object of the given statement at debug level, tab separated.
     *
     * @param statement the statement to log.
     */
    private void printStatement(Statement statement) {
        logger.debug("{}\t{}\t{}",
                new Object[] {statement.getSubject(), statement.getPredicate(), statement.getObject()});
    }

    /**
     * Checks whether the list holds an element whose runtime class is exactly the given class.
     *
     * @param list elements to inspect.
     * @param clazz class to look for; subclasses do not match.
     * @return <code>true</code> if such an element exists, <code>false</code> otherwise.
     */
    private boolean containsClass(List<?> list, Class<?> clazz) {
        for (Object o : list) {
            if (o.getClass().equals(clazz)) {
                return true;
            }
        }
        return false;
    }

}