Java tutorial: AnnoCultor XML Analyser (eu.annocultor.converter.Analyser)
/*
 * Copyright 2005-2009 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package eu.annocultor.converter;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.Map.Entry;

import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang.time.StopWatch;
import org.xml.sax.SAXException;

import eu.annocultor.api.ConverterKernel;
import eu.annocultor.api.CustomConverter;
import eu.annocultor.api.Factory;
import eu.annocultor.api.ObjectRule;
import eu.annocultor.api.Reporter;
import eu.annocultor.api.DataSource;
import eu.annocultor.api.Task;
import eu.annocultor.common.Helper;
import eu.annocultor.common.Utils;
import eu.annocultor.context.Concepts;
import eu.annocultor.context.Environment;
import eu.annocultor.context.EnvironmentAdapter;
import eu.annocultor.context.EnvironmentImpl;
import eu.annocultor.context.Namespace;
import eu.annocultor.context.Namespaces;
import eu.annocultor.data.destinations.RdfGraph;
import eu.annocultor.path.Path;
import eu.annocultor.rules.ObjectRuleImpl;
import eu.annocultor.triple.LiteralValue;
import eu.annocultor.triple.Property;
import eu.annocultor.triple.Triple;
import eu.annocultor.triple.XmlValue;
import eu.annocultor.xconverter.api.DataObject;
import eu.annocultor.xconverter.api.Graph;

/**
 * XML analyzer: creates a report on the XML structure, with separate statistics
 * on the data values of each XML element/attribute.
 *
 * Requires the XML path to the record-separating tag.
 *
 * @author Borys Omelayenko
 */
public class Analyser extends CustomConverter {

    private static final String OPT_VALUES = "maxValues";

    private static final String OPT_MAX_VALUE_SIZE = "maxValueSize";

    private static final String OPT_FN = "fn";

    public static int MAX_VALUE_SIZE = 100;

    public static int MAX_VALUES = 50;

    public static void main(String... args) throws Exception {

        // Handling command line parameters with Apache Commons CLI
        Options options = new Options();
        options.addOption(OptionBuilder.withArgName(OPT_FN).hasArg().isRequired()
                .withDescription("XML file name to be analysed").withValueSeparator(',').create(OPT_FN));
        options.addOption(OptionBuilder.withArgName(OPT_MAX_VALUE_SIZE).hasArg().withDescription(
                "Maximal size when values are counted separately. Longer values are counted altogether. Reasonable values are 100, 300, etc.")
                .create(OPT_MAX_VALUE_SIZE));
        options.addOption(OptionBuilder.withArgName(OPT_VALUES).hasArg().withDescription(
                "Maximal number of most frequent values displayed in the report. Reasonable values are 10, 25, 50")
                .create(OPT_VALUES));

        // now let's parse the input
        CommandLineParser parser = new BasicParser();
        CommandLine cmd;
        try {
            cmd = parser.parse(options, Utils.getCommandLineFromANNOCULTOR_ARGS(args));
        } catch (ParseException pe) {
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp("analyse", options);
            return;
        }

        MAX_VALUE_SIZE = Integer.parseInt(cmd.getOptionValue(OPT_MAX_VALUE_SIZE, Integer.toString(MAX_VALUE_SIZE)));
        MAX_VALUES = Integer.parseInt(cmd.getOptionValue(OPT_VALUES, Integer.toString(MAX_VALUES)));

        Analyser analyser = new Analyser(new EnvironmentImpl());

        // undo:
        /*
        analyser.task.setSrcFiles(new File("."), cmd.getOptionValue(OPT_FN));
        if (analyser.task.getSrcFiles().size() > 1) {
            analyser.task.mergeSourceFiles();
        }
        if (analyser.task.getSrcFiles().size() == 0) {
            throw new Exception("No files to analyze, pattern " + cmd.getOptionValue(OPT_FN));
        }
        File trg = new File(analyser.task.getSrcFiles().get(0).getParentFile(), "rdf");
        if (!trg.exists())
            trg.mkdir();
        System.out.println("[Analysis] Analysing files " + cmd.getOptionValue(OPT_FN)
                + ", writing analysis to " + trg.getCanonicalPath()
                + ", max value length (long values are aggregated into one 'long value' value) " + MAX_VALUE_SIZE
                + ", number of most frequently used values per field shown in report " + MAX_VALUES);
        */
        if (true)
            throw new Exception("unimplemented");

        System.exit(analyser.run());
    }

    /**
     * A facade for a task.
     */
    private static class AnalyserTask implements Task {

        Task task;

        ObjectRule rule;

        /*
         * facade on task with the same rule for any tag
         */
        public AnalyserTask(Task task) {
            super();
            this.task = task;
        }

        public List<ObjectRule> getObjectRules() {
            return task.getObjectRules();
        }

        public void setRule(ObjectRule rule) {
            this.rule = rule;
        }

        Path pathResponded = null;

        public List<ObjectRule> getRuleForSourcePath(Path path) {
            // here! the same rule for any source path
            // return Collections.singletonList(rule);
            if (pathResponded == null) {
                pathResponded = path;
            }
            if (pathResponded.equals(path)) {
                return Collections.singletonList(rule);
            }
            return new ArrayList<ObjectRule>();
        }

        public Environment getEnvironment() {
            return task.getEnvironment();
        }

        /*
         * link to the task
         */
        public void addGraph(Graph graph) {
            task.addGraph(graph);
        }

        public void addPartListener(ObjectRule map) {
            task.addPartListener(map);
        }

        public String getDatasetDescription() {
            return task.getDatasetDescription();
        }

        public String getDatasetId() {
            return task.getDatasetId();
        }

        public String getDatasetURI() {
            return task.getDatasetURI();
        }

        public Set<Graph> getGraphs() {
            return task.getGraphs();
        }

        public Namespace getTargetNamespace() {
            return task.getTargetNamespace();
        }

        @Override
        public void setDataSource(DataSource dataSource) throws IOException {
            task.setDataSource(dataSource);
        }

        @Override
        public DataSource getDataSource() {
            return task.getDataSource();
        }

        public Reporter getReporter() {
            return task.getReporter();
        }
    }

    private static class ValueCount implements Comparable<ValueCount> {

        public String value;

        public int count;

        public ValueCount(String value) {
            super();
            this.value = value;
            this.count = 0;
        }

        @Override
        public int hashCode() {
            return value.hashCode();
        }

        @Override
        public boolean equals(java.lang.Object obj) {
            // equality by the counted value, consistent with hashCode()
            return (obj instanceof ValueCount) && value.equals(((ValueCount) obj).value);
        }

        public void inc() {
            count++;
        }

        public String percent(int total) {
            return "" + ((count * 100) / total);
        }

        public int compareTo(ValueCount a) {
            // reverse order: most frequent values come first
            if (this.count < a.count)
                return 1;
            if (this.count > a.count)
                return -1;
            return 0;
        }
    }

    /*
     * Statistics are gathered while passing through the XML file, in the same
     * way as a conversion.
     */
    AnalyserTask task;

    /**
     * Collected structure + statistics: <code>property, Map(value,count)</code>
     */
    private SortedMap<String, Map<String, ValueCount>> statistics = new TreeMap<String, Map<String, ValueCount>>();

    private long passedBytes = 0;

    private StopWatch elapsed;

    private ObjectRule recordsMap = null;

    /*
     * We save all statistics from time to time, so that we still have reasonable
     * statistics if a large file makes us run out of memory.
     */
    private void computeAndExportStatistics(SortedMap<String, Map<String, ValueCount>> statistics, File tmpDir)
            throws SAXException {
        Graph trg = new RdfGraph(null, task.getEnvironment(), "analyse", "", "");
        Namespaces namespaces = new Namespaces();

        /*
         * Header
         */
        try {
            // top how many
            trg.add(new Triple(
                    Namespaces.ANNOCULTOR_REPORT + "Summary",
                    new Property(Namespaces.ANNOCULTOR_REPORT + "topCount"),
                    new LiteralValue(MAX_VALUES + ""),
                    null));
        } catch (Exception e) {
            throw new SAXException(e);
        }

        /*
         * Here we find the top ten and form an RDF report
         */
        for (String propertyName : statistics.keySet()) {
            StringBuffer message = new StringBuffer(propertyName + " has ");
            Map<String, ValueCount> values = statistics.get(propertyName);

            // find top ten
            int totalRecords = 0;
            List<ValueCount> topTen = new LinkedList<ValueCount>();
            for (String value : values.keySet()) {
                ValueCount vc = values.get(value);
                topTen.add(vc);
                totalRecords += vc.count;
            }
            Collections.sort(topTen);

            // print
            String propertyUrl = Namespaces.ANNOCULTOR_REPORT + "__" + propertyName.replace('@', 'a').replaceAll(";", "/");
            int totalValues = values.size();
            message.append(totalValues + " values: ");
            int i = 0;
            boolean allUnique = false;
            try {
                for (Iterator<ValueCount> it = topTen.iterator(); it.hasNext() && i < MAX_VALUES;) {
                    ValueCount count = it.next();
                    if (i == 0) {
                        allUnique = (count.count == 1);
                        message.append(allUnique ? " ALL UNIQUE \n" : "\n");
                        // RDF report on tag
                        trg.add(new Triple(propertyUrl, Concepts.REPORTER.REPORT_NAME,
                                new LiteralValue(propertyName), null));
                        trg.add(new Triple(propertyUrl, Concepts.REPORTER.REPORT_LABEL,
                                new LiteralValue(Path.formatPath(new Path(propertyName.replace("*", "/")), namespaces)), null));
                        trg.add(new Triple(propertyUrl, Concepts.REPORTER.REPORT_TOTAL_VALUES,
                                new LiteralValue("" + totalValues), null));
                        trg.add(new Triple(propertyUrl, Concepts.REPORTER.REPORT_ALL_UNIQUE,
                                new LiteralValue("" + allUnique), null));
                    }
                    message.append(count.value
                            + (allUnique ? "" : (" (" + count.count + ", " + count.percent(totalRecords) + "%)"))
                            + " \n");
                    // RDF report on topTen
                    trg.add(new Triple(
                            propertyUrl,
                            Concepts.REPORTER.REPORT_VALUE,
                            new LiteralValue(String.format("%07d", i) + "," + count.count + ","
                                    + count.percent(totalRecords) + "," + count.value),
                            null));
                    i++;
                }
            } catch (Exception e) {
                throw new SAXException(e);
            }
        }

        try {
            trg.endRdf();
            System.out.println("Statistics saved to " + trg.getFinalFile(1).getCanonicalPath());
            // transform results
            Helper.xsl(
                    trg.getFinalFile(1),
                    new File(trg.getFinalFile(1).getCanonicalPath().replaceFirst("\\.rdf", ".html")),
                    this.getClass().getResourceAsStream("/AnalyserReportRDF2HTML.xsl"));
        } catch (Exception e) {
            System.out.println(e.getMessage());
            throw new SAXException(e);
        }
    }

    public Analyser(Environment environment) throws Exception {
        super();
        task = new AnalyserTask(Factory.makeTask("xml", "", "Statistics report", Namespaces.NS, new AnalyzeEnvironment(environment)));
        recordsMap = ObjectRuleImpl.makeObjectRule(task, new Path(""), new Path(""), new Path(""), null, true);
        task.setRule(recordsMap);
    }

    private static class AnalyzeEnvironment extends EnvironmentAdapter {

        public AnalyzeEnvironment(Environment environment) {
            super(environment);
        }

        @Override
        public void completeWithDefaults() throws Exception {
            super.completeWithDefaults();
            setParameter(PARAMETERS.ANNOCULTOR_OUTPUT_DIR, getDocDir().getCanonicalPath());
        }
    }

    @Override
    public int run() throws Exception {
        ConverterHandler converterHandler = new StatisticsGatheringConverter(this);
        ConverterKernel converter = new Converter(task, converterHandler, null) {

            @Override
            public BufferedInputStream makeInputStream(File src) throws FileNotFoundException {
                elapsed = new StopWatch();
                try {
                    elapsed.start();
                } catch (Exception e) {
                    throw new RuntimeException(e);
                }
                return new BufferedInputStream(new FileInputStream(src), 1024 * 1024) {

                    @Override
                    public synchronized int read() throws IOException {
                        int r = super.read();
                        if (r >= 0) {
                            // read() returns a byte value (0-255) or -1, not a byte count
                            passedBytes++;
                        }
                        return r;
                    }

                    @Override
                    public synchronized int read(byte[] b, int off, int len) throws IOException {
                        int r = super.read(b, off, len);
                        if (r > 0) {
                            passedBytes += r;
                        }
                        return r;
                    }
                };
            }
        };
        return converter.convert();
    }

    /**
     * Overrides the end-of-document callback to compute statistics.
     */
    private class StatisticsGatheringConverter extends ConverterHandler {

        Analyser analyser;

        public StatisticsGatheringConverter(Analyser analyser) {
            super(analyser.task);
            this.analyser = analyser;
        }

        @Override
        protected String getTopCompletedPartSubject() {
            // simulate one object
            return "OBJECT"; // + ( nr ++) ;
        }

        @Override
        public void multiFileEndDocument() throws SAXException {
            super.multiFileEndDocument();
            log.info("Writing statistics in RDF");
            computeAndExportStatistics(statistics, analyser.task.getEnvironment().getTmpDir());
        }

        @Override
        protected DataObject makeDataObjectForNewRecord(Path path, ObjectRule rule, DataObject parent) {
            return new StatisticsDataObject(path, rule, parent, statistics, analyser);
        }
    }

    private static class StatisticsDataObject extends DataObjectImpl {

        SortedMap<String, Map<String, ValueCount>> statistics = null;

        static long passedTags = 0;

        Analyser analyser;

        public StatisticsDataObject(
                Path path,
                ObjectRule rule,
                DataObject parent,
                SortedMap<String, Map<String, ValueCount>> statistics,
                Analyser analyser) {
            super(path, rule, parent);
            this.statistics = statistics;
            this.analyser = analyser;
        }

        @Override
        public ListOfValues getValues(Path query) {
            ListOfValues r = new ListOfValues();
            r.add(new XmlValue("dummy"));
            return r;
        }

        @Override
        public void addValue(Path path2, XmlValue newValue) throws Exception {
            // all explicated properties
            Set<Path> allProperties = new HashSet<Path>();
            allProperties.addAll(path2.explicate());

            // count per explicated property
            for (Path path : allProperties) {
                String pathStr = path.getPath();
                Map<String, ValueCount> propertyStats = statistics.get(pathStr);
                if (propertyStats == null) {
                    propertyStats = new TreeMap<String, ValueCount>();
                    statistics.put(pathStr, propertyStats);
                }
                // per value
                String value = path.isAttributeQuery() ? path.getValue() : newValue.getValue().trim();
                if (MAX_VALUE_SIZE > 0 && value.length() > MAX_VALUE_SIZE) {
                    value = "LONG VALUE (MAX " + MAX_VALUE_SIZE + ")";
                }
                ValueCount vc = propertyStats.get(value);
                if (vc == null) {
                    vc = new ValueCount(value);
                    propertyStats.put(value, vc);
                }
                vc.inc();
            }

            // report progress and save intermediate statistics every 100,000 tags
            final int reportEveryTags = 100000;
            if (passedTags % reportEveryTags == 0) {
                int distinctCounts = 0;
                int distinctValues = 1;
                System.out.print("Prop stat:");
                for (Entry<String, Map<String, ValueCount>> e : statistics.entrySet()) {
                    System.out.print(e.getValue().size() + ",");
                    for (Entry<String, ValueCount> ev : e.getValue().entrySet()) {
                        distinctValues++;
                        distinctCounts += ev.getValue().count;
                    }
                }
                System.out.println();
                System.out.println("Passed: "
                        + (analyser.elapsed.getTime() / 1000) + " s, "
                        + (analyser.passedBytes / 1000000) + " MB XML, "
                        + passedTags + " values, "
                        + statistics.size() + " paths, "
                        + distinctValues + " distinct values, "
                        + (distinctValues / (statistics.size() + 1)) + " average values per property, "
                        + (distinctCounts / distinctValues) + " occurrences per value");
                System.out.print("Saving stats...");
                analyser.computeAndExportStatistics(statistics, new File("."));
                System.out.println("done.");
            }
            passedTags++;
        }
    }
}
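The command-line handling in main() is plain Apache Commons CLI 1.x (BasicParser, OptionBuilder, HelpFormatter), matching the listing's imports. Below is a minimal, self-contained sketch of that pattern using the same option names; the class name CliSketch, the defaults, and the println are illustrative only and not part of AnnoCultor.

import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;

// Minimal sketch of the Commons CLI pattern used in Analyser.main():
// define the options, parse the arguments, print usage on a parse error.
public class CliSketch {

    public static void main(String[] args) {
        Options options = new Options();
        options.addOption(OptionBuilder.withArgName("fn").hasArg().isRequired()
                .withDescription("XML file name to be analysed").create("fn"));
        options.addOption(OptionBuilder.withArgName("maxValueSize").hasArg()
                .withDescription("Maximal value size counted separately").create("maxValueSize"));
        options.addOption(OptionBuilder.withArgName("maxValues").hasArg()
                .withDescription("Number of most frequent values in the report").create("maxValues"));

        CommandLineParser parser = new BasicParser();
        try {
            CommandLine cmd = parser.parse(options, args);
            // defaults mirror the fields MAX_VALUE_SIZE = 100 and MAX_VALUES = 50 above
            int maxValueSize = Integer.parseInt(cmd.getOptionValue("maxValueSize", "100"));
            int maxValues = Integer.parseInt(cmd.getOptionValue("maxValues", "50"));
            System.out.println("Analysing " + cmd.getOptionValue("fn")
                    + " with maxValueSize=" + maxValueSize + ", maxValues=" + maxValues);
        } catch (ParseException pe) {
            new HelpFormatter().printHelp("analyse", options);
        }
    }
}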
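The heart of the analyser is the statistics map filled in StatisticsDataObject.addValue(): each XML path maps to a per-value counter, values longer than MAX_VALUE_SIZE are collapsed into a single "LONG VALUE" bucket, and the counters are later sorted most-frequent-first (ValueCount.compareTo) when the report is written. The following standalone sketch shows just that counting-and-ranking idea with plain JDK types; the class PathValueStats and the sample data are illustrative, not AnnoCultor code.

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;

// Sketch of the path -> (value -> count) structure used by the analyser,
// including aggregation of over-long values and the "most frequent first"
// ranking used when the report is produced.
public class PathValueStats {

    private static final int MAX_VALUE_SIZE = 100;

    // path -> (value -> number of occurrences)
    private final SortedMap<String, Map<String, Integer>> statistics =
            new TreeMap<String, Map<String, Integer>>();

    public void addValue(String path, String value) {
        if (MAX_VALUE_SIZE > 0 && value.length() > MAX_VALUE_SIZE) {
            value = "LONG VALUE (MAX " + MAX_VALUE_SIZE + ")";
        }
        Map<String, Integer> perValue = statistics.get(path);
        if (perValue == null) {
            perValue = new TreeMap<String, Integer>();
            statistics.put(path, perValue);
        }
        Integer count = perValue.get(value);
        perValue.put(value, count == null ? 1 : count + 1);
    }

    // the most frequent values for a path, analogous to the "top ten" loop
    public List<Map.Entry<String, Integer>> topValues(String path, int maxValues) {
        List<Map.Entry<String, Integer>> top =
                new ArrayList<Map.Entry<String, Integer>>(statistics.get(path).entrySet());
        Collections.sort(top, new Comparator<Map.Entry<String, Integer>>() {
            public int compare(Map.Entry<String, Integer> a, Map.Entry<String, Integer> b) {
                // reverse order: largest count first
                return b.getValue().compareTo(a.getValue());
            }
        });
        return top.subList(0, Math.min(maxValues, top.size()));
    }

    public static void main(String[] args) {
        PathValueStats stats = new PathValueStats();
        stats.addValue("/record/title", "Sunflowers");
        stats.addValue("/record/title", "Sunflowers");
        stats.addValue("/record/title", "Irises");
        System.out.println(stats.topValues("/record/title", 10));
    }
}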
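In run(), the source file is wrapped in a BufferedInputStream subclass so the bytes read so far can appear in the progress log. The same effect can be achieved with a small counting stream; ByteCountingInputStream below is a generic sketch of the idea and not an AnnoCultor class.

import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;

// Sketch of a byte-counting stream: every successful read updates a counter
// that a progress reporter can poll, as Analyser.run() does with passedBytes.
public class ByteCountingInputStream extends FilterInputStream {

    private long passedBytes = 0;

    public ByteCountingInputStream(InputStream in) {
        super(in);
    }

    @Override
    public int read() throws IOException {
        int r = super.read();
        if (r >= 0) {
            passedBytes++; // read() returns a byte value (0-255) or -1 at end of stream
        }
        return r;
    }

    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        int r = super.read(b, off, len);
        if (r > 0) {
            passedBytes += r; // here r really is a byte count
        }
        return r;
    }

    public long getPassedBytes() {
        return passedBytes;
    }
}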