Java tutorial
package uk.ac.ebi.arrayexpress.utils.saxon.search; /* * Copyright 2009-2011 European Molecular Biology Laboratory * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ import java.io.File; import java.io.IOException; import java.io.StringReader; import java.math.BigDecimal; import java.math.BigInteger; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import javax.xml.transform.stream.StreamSource; import javax.xml.xpath.XPath; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathExpressionException; import net.sf.saxon.Configuration; import net.sf.saxon.om.DocumentInfo; import net.sf.saxon.om.NodeInfo; import net.sf.saxon.xpath.XPathEvaluator; import org.apache.commons.configuration.HierarchicalConfiguration; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.PerFieldAnalyzerWrapper; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.NumericField; import org.apache.lucene.facet.index.CategoryDocumentBuilder; import org.apache.lucene.facet.taxonomy.CategoryPath; import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TopFieldCollector; import org.apache.lucene.search.TotalHitCountCollector; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xmldb.api.DatabaseManager; import org.xmldb.api.base.Collection; import org.xmldb.api.base.ResourceIterator; import org.xmldb.api.base.ResourceSet; import org.xmldb.api.modules.XPathQueryService; import uk.ac.ebi.arrayexpress.app.Application; import uk.ac.ebi.arrayexpress.components.SaxonEngine; import uk.ac.ebi.arrayexpress.components.XmlDbConnectionPool; import uk.ac.ebi.arrayexpress.utils.HttpServletRequestParameterMap; import uk.ac.ebi.arrayexpress.utils.StringTools; import uk.ac.ebi.arrayexpress.utils.saxon.PrintUtils; import uk.ac.ebi.arrayexpress.utils.saxon.search.AbstractIndexEnvironment.AttsInfo; public abstract class AbstractIndexEnvironment { // logging machinery private final Logger logger = LoggerFactory.getLogger(getClass()); // source index configuration (will be eventually removed) public HierarchicalConfiguration indexConfig; // index configuration, parsed public String indexId; public Directory indexDirectory; // I need this to create an temporary directory during the relod job // execution public String indexLocationDirectory; public PerFieldAnalyzerWrapper indexAnalyzer; public String defaultField; // I will not open the index in each request private IndexReader ir = null; // index document xpath public String indexDocumentPath; // number of documents indexed private int countDocuments; public int getCountDocuments() { return countDocuments; } public void setCountDocuments(int count) { this.countDocuments = count; } public String getDefaultField() { return defaultField; } // private Map<String, XPathExpression> fieldXpe = new HashMap<String, // XPathExpression>(); // keep information realted with attributes public class AttsInfo { public String name; public String type; public AttsInfo(String name, String type) { setName(name); setType(type); } public String getName() { return name; } public void setName(String name) { this.name = name; } public String getType() { return type; } public void setType(String type) { this.type = type; } } // TODO: rpe (review this) public IndexReader getIndexReader() { if (ir == null) { synchronized (this) { try { // logger.debug("test"); ir = IndexReader.open(this.indexDirectory, true); } catch (Exception e) { e.printStackTrace(); } } } return ir; } public void closeIndexReader() { if (ir != null) { try { logger.debug("Close the closeIndexReader!!!"); ir.close(); ir = null; } catch (Exception e) { logger.error("ERROR closeIndexReader!!!"); e.printStackTrace(); } } } // TODO: rpe Just to test // private void closeIndexReader(){ // ir=null; // } public void setDefaultField(String defaultField) { this.defaultField = defaultField; } public String getDefaultSortField() { return defaultSortField; } public void setDefaultSortField(String defaultSortField) { this.defaultSortField = defaultSortField; } public boolean getDefaultSortDescending() { return defaultSortDescending; } public void setDefaultSortDescending(boolean defaultSortDescending) { this.defaultSortDescending = defaultSortDescending; } public int getDefaultPageSize() { return defaultPageSize; } public void setDefaultPageSize(int defaultPageSize) { this.defaultPageSize = defaultPageSize; } /** * Default field used to sort if anyone is specified */ protected String defaultSortField = "releasedate"; /** * Default orientation (Ascending) */ protected boolean defaultSortDescending = false; /** * Default page size */ protected int defaultPageSize = 25; public Map<String, FieldInfo> fields; // document info public int documentHashCode; public AbstractIndexEnvironment(HierarchicalConfiguration indexConfig) { this.indexConfig = indexConfig; populateIndexConfiguration(); // TODO: review this (This is causing one error when you are forcing the // index building on server start, and the lucenes directory is no there // this is not common but it's an error - I will not change this until // we merge ArrayExpress Code with Biosamples (the appraoch can be a // little bit different) setup(); } private void populateIndexConfiguration() { try { this.indexId = this.indexConfig.getString("[@id]"); indexLocationDirectory = this.indexConfig.getString("[@location]"); this.indexDirectory = FSDirectory.open(new File(indexLocationDirectory, this.indexId)); String indexAnalyzer = this.indexConfig.getString("[@defaultAnalyzer]"); Analyzer a = (Analyzer) Class.forName(indexAnalyzer).newInstance(); this.indexAnalyzer = new PerFieldAnalyzerWrapper(a); this.indexDocumentPath = indexConfig.getString("document[@path]"); this.defaultField = indexConfig.getString("document[@defaultField]"); List fieldsConfig = indexConfig.configurationsAt("document.field"); this.fields = new HashMap<String, FieldInfo>(); for (Object fieldConfig : fieldsConfig) { FieldInfo fieldInfo = new FieldInfo((HierarchicalConfiguration) fieldConfig); fields.put(fieldInfo.name, fieldInfo); if (null != fieldInfo.analyzer) { Analyzer fa = (Analyzer) Class.forName(fieldInfo.analyzer).newInstance(); this.indexAnalyzer.addAnalyzer(fieldInfo.name, fa); } } } catch (Exception x) { logger.error("Caught an exception:", x); } } public boolean doesFieldExist(String fieldName) { return fields.containsKey(fieldName); } /* * (non-Javadoc) This is the mains function of this classe and it will * address the query, sort and paging issues * * @see * uk.ac.ebi.arrayexpress.utils.saxon.search.IIndexEnvironment#queryPaged * (java.lang.Integer, uk.ac.ebi.arrayexpress.utils.saxon.search.QueryInfo, * uk.ac.ebi.arrayexpress.utils.HttpServletRequestParameterMap) */ public String queryPaged(Integer queryId, QueryInfo info, HttpServletRequestParameterMap map) throws IOException { // IndexReader ir = null; IndexSearcher isearcher = null; if (logger.isDebugEnabled()) { logger.debug("start of queryPaged"); } StringBuilder totalRes = new StringBuilder(); totalRes.append("<content>"); Query query = info.getQuery(); try { ir = getIndexReader(); if (query instanceof BooleanQuery && ((BooleanQuery) query).clauses().isEmpty()) { logger.info("Empty search, returned all [{}] documents", getCountDocuments()); // this is much more faster query = new MatchAllDocsQuery(); } isearcher = new IndexSearcher(ir); boolean descending = getDefaultSortDescending(); ; String sortBy = StringTools.arrayToString(map.get("sortby"), " "); if (sortBy == null || sortBy.equalsIgnoreCase("")) { sortBy = getDefaultSortField(); } String sortOrder = StringTools.arrayToString(map.get("sortorder"), " "); if (sortOrder != null) { if (sortOrder.equalsIgnoreCase("ascending")) { descending = false; } else { descending = true; } } // I have to test the sort field name. If it is a string i have to // add "sort" to the name // I will only sort if I have a Field // TopDocs hits; ScoreDoc[] hits = null; Sort sort = null; if (doesFieldExist(sortBy)) { FieldInfo sortField = fields.get(sortBy); if (sortField == null) { logger.info("A sort field is trying to be used but that field is not defined! ->[{}]", sortBy); } int sortFieldsSize = sortField.sortFields != null ? sortField.sortFields.size() : 0; SortField[] sortFieldArray = new SortField[sortFieldsSize]; if (sortFieldsSize > 0) { StringBuilder sb = new StringBuilder(); for (int i = 0; i < sortField.sortFields.size(); i++) { FieldInfo otherSortField = fields.get(sortField.sortFields.get(i)); if (otherSortField == null) { logger.info( "Other sort field is trying to be used but that field is not defined! ->[{}]", sortField.sortFields.get(i)); } else { String sortByName = otherSortField.name; int descendingType = SortField.STRING_VAL; sb.append("new sortField ->").append(otherSortField.name).append("; "); if (otherSortField.name.equalsIgnoreCase(sortBy) && otherSortField.type.equalsIgnoreCase("string")) { sortByName += "sort"; } else { if (otherSortField.type.equalsIgnoreCase("integer")) { descendingType = SortField.LONG; } } sortFieldArray[i] = new SortField(sortByName, descendingType, descending); } } logger.debug("Query sorted by: ->[{}] descending: ->[{}]", sb.toString(), descending); } sort = new Sort(sortFieldArray); } // The fiels doenst exist but it can be one of the dynamic, so I // have to add it else { SortField[] sortFieldArray = new SortField[1]; sortBy += "sort"; sortFieldArray[0] = new SortField(sortBy, SortField.STRING_VAL, descending); logger.debug("Query sorted by a dynamic sample attribute: ->[{}] descending: ->[{}]", sortBy, descending); sort = new Sort(sortFieldArray); // logger.info( // "Sort query field [{}] doenst exist or the SortBy parameter was not specified", // sortBy); } int pageSize = defaultPageSize; if (map.containsKey("pagesize")) { pageSize = Integer.parseInt(StringTools.arrayToString(map.get("pagesize"), " ")); } else { pageSize = getDefaultPageSize(); map.put("pagesize", Integer.toString(pageSize)); } int page = 0; if (map.containsKey("page")) { page = Integer.parseInt(StringTools.arrayToString(map.get("page"), " ")) - 1; } int initialExp = page * pageSize; int finalExp = initialExp + pageSize; // I will execute the same query with or without Sortby parameter // (in the last case the sort will be null) // /TopFieldCollector collectorAux = null; TopFieldCollector collector = null; int numHits = getCountDocuments() + 1; collector = TopFieldCollector.create(sort == null ? new Sort() : sort, // / collectorAux = TopFieldCollector.create(sort == null ? new // Sort() // / : sort, // TODO: rpe If im returning page 3 using pagesize of 50 i need to // sort (3*50) (page == 0 ? 1 : page + 1) * pageSize, false, // fillFields // - not // needed, // we want // score and // doc only false, // trackDocScores - need doc and score fields false, // trackMaxScore - related to trackDocScores sort == null); // should docs be in docId order? // /TopFieldCollectorReference collector= new // TopFieldCollectorReference(collectorAux); // facets: // Directory taxDir = FSDirectory.open(new // File("/Users/rpslpereira/Apps/apache-tomcat-6.0.33/temp/Setup/LuceneIndexesFacets/biosamplesgroup")); // DirectoryTaxonomyReader taxor = new // DirectoryTaxonomyReader(taxDir); // FacetSearchParams fsp = new FacetSearchParams(); // fsp.addFacetRequest(new CountFacetRequest(new // CategoryPath("samples"), 10)); // FacetsCollector facetsCollector = new FacetsCollector(fsp, ir, // taxor); // isearcher.search(new MatchAllDocsQuery(), facetsCollector); // for (FacetResult fres : facetsCollector.getFacetResults()) { // FacetResultNode root = fres.getFacetResultNode(); // System.out.println(root.getLabel() + " (" + root.getValue() + // ")"); // for (FacetResultNode cat : root.getSubResults()) { // System.out.println(" " + cat.getLabel().getComponent(1) // + " (" + cat.getValue() + ")"); // } // } isearcher.search(query, collector); // I will use this Collector to know how much results do i have long timeHits = System.nanoTime(); TotalHitCountCollector collector2 = new TotalHitCountCollector(); // /TotalHitCountCollectorReference collector2 = new // TotalHitCountCollectorReference(); isearcher.search(query, collector2); double ms = (System.nanoTime() - timeHits) / 1000000d; logger.info("Number of Docs TotalHitCountCollector->" + collector2.getTotalHits() + "- TOTALHITS TOOK->" + ms); int totalHits = collector2.getTotalHits(); TopDocs topDocs = collector.topDocs(); // hits= topDocs.scoreDocs; hits = topDocs.scoreDocs; logger.info("Search of index [" + this.indexId + "] with query [{}] returned [{}] hits", query.toString(), hits.length); logger.info("Beginning of paging logic"); if (finalExp > hits.length) { finalExp = hits.length; } List<String> combinedTotal = new ArrayList<String>(); combinedTotal.add(String.valueOf(totalHits)); map.put("total", combinedTotal.toArray(new String[combinedTotal.size()])); logger.info("End of paging logic, requesting data from [{}] to [{}]", initialExp, finalExp); long time = System.nanoTime(); if (logger.isDebugEnabled()) { logger.debug("Requesting data from xml database"); } // this QueryDB should be implemented by all subclasses and is // responsible for the data collecting totalRes.append(queryDB(hits, isearcher, initialExp, finalExp, map)); if (logger.isDebugEnabled()) { logger.debug("End of requesting data from xml database"); } isearcher.close(); // /ir.close(); } catch (Exception x) { logger.error("Caught an exception:", x); } finally { if (null != isearcher) isearcher.close(); // if (null != ir) // ir.close(); } totalRes.append("</content>"); if (logger.isDebugEnabled()) { logger.debug("End of QueryPaged"); } return totalRes.toString(); } /** * @param hits * this just represents a subset of the result * @param TotalHits * @param isearcher * @param initialExp * @param finalExp * @param map * @return * @throws Exception */ abstract public String queryDB(ScoreDoc[] hits, IndexSearcher isearcher, int initialExp, int finalExp, HttpServletRequestParameterMap map) throws Exception; /* * (non-Javadoc) * * @see * uk.ac.ebi.arrayexpress.utils.saxon.search.IIndexEnvironment#queryAllDocs * (java.lang.Integer, uk.ac.ebi.arrayexpress.utils.saxon.search.QueryInfo, * uk.ac.ebi.arrayexpress.utils.HttpServletRequestParameterMap) */ public ScoreDoc[] queryAllDocs(Integer queryId, QueryInfo info, HttpServletRequestParameterMap map) throws IOException { IndexReader ir = null; IndexSearcher isearcher = null; Query query = info.getQuery(); ScoreDoc[] hits = null; try { ir = IndexReader.open(this.indexDirectory, true); // empty query returns everything if (query instanceof BooleanQuery && ((BooleanQuery) query).clauses().isEmpty()) { logger.info("queryAllDocs Empty search, returned all [{}] documents", getCountDocuments()); // I need to continue because e i need to sort the data, so I // will create an empty query (this happens when I'm a curator // and I dont have any search criteria) // Term t = new Term(defaultField, "*"); // ((BooleanQuery) query).add(new BooleanClause(new // WildcardQuery( // t), BooleanClause.Occur.SHOULD)); // this is much more faster query = new MatchAllDocsQuery(); } // to show _all_ available nodes isearcher = new IndexSearcher(ir); // +1 is a trick to prevent from having an exception thrown if // documentNodes.size() value is 0 boolean descending = true; String sortBy = StringTools.arrayToString(map.get("sortby"), " "); if (sortBy != null && sortBy.equalsIgnoreCase("")) { sortBy = getDefaultSortField(); } String sortOrder = StringTools.arrayToString(map.get("sortorder"), " "); if (sortOrder != null) { if (sortOrder.equalsIgnoreCase("ascending")) { descending = false; } else { descending = true; } } int sortFieldType = SortField.INT; // I have to test the sort field name. If it is a string i have to // add "sort" to the name // I will only sort if I have a Field Sort sort = null; if (!sortBy.equalsIgnoreCase("") && doesFieldExist(sortBy)) { FieldInfo sortField = fields.get(sortBy); if (sortField == null) { logger.info("A sort field is trying to be used but that field is not defined! ->[{}]", sortBy); } int sortFieldsSize = sortField.sortFields != null ? sortField.sortFields.size() : 0; SortField[] sortFieldArray = new SortField[sortFieldsSize]; // sortFieldArray[0]=new SortField(sortBy, sortFieldType, // descending); if (sortFieldsSize > 0) { StringBuilder sb = new StringBuilder(); for (int i = 0; i < sortField.sortFields.size(); i++) { FieldInfo otherSortField = fields.get(sortField.sortFields.get(i)); if (otherSortField == null) { logger.info( "Other sort field is trying to be used but that field is not defined! ->[{}]", sortField.sortFields.get(i)); } else { String sortByName = otherSortField.name; int descendingType = SortField.STRING_VAL; sb.append("new sortField ->").append(otherSortField.name).append("; "); if (otherSortField.name.equalsIgnoreCase(sortBy) && otherSortField.type.equalsIgnoreCase("string")) { sortByName += "sort"; } else { if (otherSortField.type.equalsIgnoreCase("integer")) { descendingType = SortField.INT; } } sortFieldArray[i] = new SortField(sortByName, descendingType, descending); } } logger.info("Query sorted by: ->[{}]", sb.toString()); } sort = new Sort(sortFieldArray); // hits = isearcher.search(query, getCountDocuments() + 1, // sort); } else { // hits = isearcher.search(query, getCountDocuments() + 1); logger.info("Sort query field [{}] doenst exist or the SortBy parameter was not specified", sortBy); } // I will execute the same query with or without Sortby parameter // (in the last case the sort will be null) int numHits = getCountDocuments() + 1; TopFieldCollector collector = TopFieldCollector.create(sort == null ? new Sort() : sort, numHits, false, // fillFields // - not // needed, // we // want // score // and // doc // only false, // trackDocScores - need doc and score fields false, // trackMaxScore - related to trackDocScores sort == null); // should docs be in docId order? isearcher.search(query, collector); TopDocs topDocs = collector.topDocs(); // hits= topDocs.scoreDocs; hits = topDocs.scoreDocs; map.put("total", Integer.toString(hits.length)); isearcher.close(); ir.close(); } catch (Exception x) { logger.error("Caught an exception:", x); } finally { if (null != isearcher) isearcher.close(); if (null != ir) ir.close(); } return hits; } // TODO RPE public void indexReader() { // IndexReader ir = null; try { logger.info("Reload the Lucene Index for [{}]", indexId); ir = IndexReader.open(this.indexDirectory, true); Map<String, String> map = ir.getCommitUserData(); logger.info("numberDocs->" + map.get("numberDocs")); logger.info("date->" + map.get("date")); logger.info("keyValidator->" + map.get("keyValidator")); this.setCountDocuments(Integer.parseInt(map.get("numberDocs"))); } catch (Exception x) { logger.error("Caught an exception:", x); } finally { try { if (null != ir) { ir.close(); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); logger.error("Caught an exception:", e); } } } public void setup() { // TODO Auto-generated method stub closeIndexReader(); getIndexReader(); // I need to do this, because the setup method is // called when a full reload occurs and we need to // open it again logger.info("default setup for Index Environment"); } public String getMetadataInformation() { String ret = "<table>"; Map<String, String> map = getIndexReader().getCommitUserData(); for (String key : map.keySet()) { ret += "<tr><td valign='top'><u>" + key + "</u></td><td>" + map.get(key) + "</td></tr>"; } ret += "</table>"; return ret; } // no parameters menas that i will do all the work in the default database // (the one that is configured) public void indexFromXmlDB() throws Exception { // String indexLocationDirectory = ""; String dbHost = ""; String dbPassword = ""; String dbName = ""; int dbPort = 0; // get the default location // indexLocationDirectory = this.indexLocationDirectory; HierarchicalConfiguration connsConf = (HierarchicalConfiguration) Application.getInstance().getPreferences() .getConfSubset("bs.xmldatabase"); if (null != connsConf) { // connectionString = connsConf.getString("connectionstring"); dbHost = connsConf.getString("host"); dbPort = Integer.parseInt(connsConf.getString("port")); dbName = connsConf.getString("dbname"); dbPassword = connsConf.getString("adminpassword"); } else { logger.error("bs.xmldatabase Configuration is missing!!"); } indexFromXmlDB(indexLocationDirectory, dbHost, dbPort, dbPassword, dbName); } // TODO: I'm assuming that there is always an attribute @id in each element public void indexFromXmlDB(String indexLocationDirectory, String dbHost, int dbPort, String dbPassword, String dbName) throws Exception { int countNodes = 0; String driverXml = ""; String connectionString = ""; Collection coll; IndexWriter w = null; DirectoryTaxonomyWriter taxoWriter = null; Map<String, XPathExpression> fieldXpe = new HashMap<String, XPathExpression>(); try { Directory indexTempDirectory = FSDirectory.open(new File(indexLocationDirectory, indexId)); logger.debug("Index directory->" + indexLocationDirectory); w = createIndex(indexTempDirectory, indexAnalyzer); Directory taxDir = FSDirectory.open(new File(indexLocationDirectory + "Facets", indexId)); taxoWriter = new DirectoryTaxonomyWriter(taxDir); CategoryDocumentBuilder docBuilder = new CategoryDocumentBuilder(taxoWriter); HierarchicalConfiguration connsConf = (HierarchicalConfiguration) Application.getInstance() .getPreferences().getConfSubset("bs.xmldatabase"); if (null != connsConf) { // TODO: rpe use the component XmlDatabasePooling driverXml = connsConf.getString("driver"); // I will use the connectionString that was passed by parameter // (in several parameters) connectionString = connsConf.getString("base") + "://" + dbHost + ":" + dbPort + "/" + dbName; } else { logger.error("bs.xmldatabase Configuration is missing!!"); } // I cannot register this database again (this is already registered // on XmlDbConnectionPool Component - // java.nio.channels.OverlappingFileLockException // c = Class.forName(driverXml); // db = (Database) c.newInstance(); // DatabaseManager.registerDatabase(db); logger.debug("connectionString->" + connectionString); coll = DatabaseManager.getCollection(connectionString); XPathQueryService service = (XPathQueryService) coll.getService("XPathQueryService", "1.0"); DocumentInfo source = null; // Loop through all result items // collect all the fields data Configuration config = ((SaxonEngine) Application.getAppComponent("SaxonEngine")).trFactory .getConfiguration(); XPath xp = new XPathEvaluator(config); // XPathExpression xpe = xp.compile(this.env.indexDocumentPath); for (FieldInfo field : fields.values()) { fieldXpe.put(field.name, xp.compile(field.path)); logger.debug("Field Path->[{}]", field.path); } // the xmldatabase is not very correct and have memory problem for // queires with huge results, so its necessary to implement our own // iteration mechanism // // // I will collect all the results // ResourceSet set = service.query(this.env.indexDocumentPath); // //TODO rpe // //ResourceSet set = service.query("//Sample"); // logger.debug("Number of results->" + set.getSize()); // long numberResults = set.getSize(); long numberResults = 0; ResourceSet set = service.query("count(" + indexDocumentPath + ")"); if (set.getIterator().hasMoreResources()) { numberResults = Integer.parseInt((String) set.getIterator().nextResource().getContent()); } logger.debug("Number of results->" + numberResults); long pageSizeDefault = 50000; // the samplegroup cannot be big otherwise I will obtain a memory // error ... but the sample must b at least one million because the // paging queries are really slow - we need to balance it // (for samples 1million, for samplegroup 50000) if (numberResults > 1000000) { pageSizeDefault = 1000000; } long pageNumber = 1; int count = 0; // Map<String, AttsInfo[]> cacheAtt = new HashMap<String, // AttsInfo[]>(); // Map<String, XPathExpression> cacheXpathAtt = new HashMap<String, // XPathExpression>(); // Map<String, XPathExpression> cacheXpathAttValue = new // HashMap<String, XPathExpression>(); while ((pageNumber * pageSizeDefault) <= (numberResults + pageSizeDefault - 1)) { // while ((pageNumber<=1)) { // calculate the last hit long pageInit = (pageNumber - 1) * pageSizeDefault + 1; long pageSize = (pageNumber * pageSizeDefault < numberResults) ? pageSizeDefault : (numberResults - pageInit + 1); service = (XPathQueryService) coll.getService("XPathQueryService", "1.0"); // xquery paging using subsequence function long time = System.nanoTime(); // /set = // service.query("for $x in(/Biosamples/SampleGroup/Sample/@id) return string($x)"); // I'm getting everything based on nodeId, because i have the // sample sample in different samplegroups // TODO: change this (just works with baseX) set = service.query("for $x in(subsequence(" + indexDocumentPath + "," + pageInit + "," + pageSize + ")) return db:node-id($x)"); // logger.debug("Number of results of page->" + set.getSize()); double ms = (System.nanoTime() - time) / 1000000d; logger.info("Query XMLDB took ->[{}]", ms); ResourceIterator iter = set.getIterator(); XPath xp2; XPathExpression xpe2; List documentNodes; StringReader reader; // cache of distinct attributes fora each sample group while (iter.hasMoreResources()) { count++; logger.debug("its beeing processed the number ->" + count); String idNode = (String) iter.nextResource().getContent(); //logger.debug("Id node->" + idNode); // I need to get the sample // ResourceSet setid = service.query(indexDocumentPath // + "[@id='" + idSample + "']"); ResourceSet setid = service.query("db:open-id('" + dbName + "'," + idNode + ")"); ResourceIterator iterid = setid.getIterator(); List<CategoryPath> sampleCategories = null; while (iterid.hasMoreResources()) { // System.out.println(""); // /xml=(String) iterid.nextResource().getContent(); // /xml=(String) iter.nextResource().getContent(); // /reader = new StringReader(xml); StringBuilder xml = new StringBuilder(); xml.append((String) iterid.nextResource().getContent()); // logger.debug("xml->"+xml); // logger.debug(xml.toString()); reader = new StringReader(xml.toString()); source = config.buildDocument(new StreamSource(reader)); // logger.debug("XML DB->[{}]", // PrintUtils.printNodeInfo((NodeInfo) source, config)); Document d = new Document(); xp2 = new XPathEvaluator(source.getConfiguration()); int position = indexDocumentPath.lastIndexOf("/"); ; String pathRoot = ""; if (position != -1) { pathRoot = indexDocumentPath.substring(position); } else { pathRoot = indexDocumentPath; } // logger.debug("PathRoot->[{}]",pathRoot); xpe2 = xp2.compile(pathRoot); // TODO rpe // xpe2 = xp2.compile("/Sample"); documentNodes = (List) xpe2.evaluate(source, XPathConstants.NODESET); for (Object node : documentNodes) { try { d = processEntryIndex(node, config, service, fieldXpe); } catch (Exception x) { String xmlError = PrintUtils.printNodeInfo((NodeInfo) node, config); logger.error("XML that was being processed when the error occurred DB->[{}]", xmlError); // to avoid the next running to stop // because its not able to delete the // newSetup directory w.close(); throw new Exception("Xml that is being processed:" + xmlError, x); } } documentNodes = null; source = null; reader = null; xml = null; countNodes++; // logger.debug("count->[{}]", countNodes); // facet tests docBuilder.setCategoryPaths(sampleCategories); docBuilder.build(d); addIndexDocument(w, d); } } logger.debug("until now it were processed->[{}]", pageNumber * pageSizeDefault); pageNumber++; if (coll != null) { try { // coll.close(); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } set = null; } setCountDocuments(countNodes); // add metadata to the lucene index Map<String, String> map = new HashMap<String, String>(); map.put("numberDocs", Integer.toString(countNodes)); map.put("date", Long.toString(System.nanoTime())); // logger.debug(Application.getInstance().getComponent("XmlDbConnectionPool").getMetaDataInformation()); // I cannot call directly // getComponent("XmlDbConnectionPool").getMetaDataInformation(), // because I can be working in a did String dbInfo = ((XmlDbConnectionPool) Application.getInstance().getComponent("XmlDbConnectionPool")) .getDBInfo(dbHost, dbPort, dbPassword, dbName); map.put("DBInfo", dbInfo); // facet taxoWriter.commit(); taxoWriter.close(); commitIndex(w, map); } catch (Exception x) { logger.error("Caught an exception:", x); taxoWriter.close(); w.close(); throw x; } } public void indexIncrementalFromXmlDB() throws Exception { String indexLocationDirectory = ""; String dbHost = ""; String dbPassword = ""; String dbName = ""; int dbPort = 0; // get the default location indexLocationDirectory = indexLocationDirectory; HierarchicalConfiguration connsConf = (HierarchicalConfiguration) Application.getInstance().getPreferences() .getConfSubset("bs.xmldatabase"); if (null != connsConf) { // connectionString = connsConf.getString("connectionstring"); dbHost = connsConf.getString("host"); dbPort = Integer.parseInt(connsConf.getString("port")); dbName = connsConf.getString("dbname"); dbPassword = connsConf.getString("adminpassword"); } else { logger.error("bs.xmldatabase Configuration is missing!!"); } indexIncrementalFromXmlDB(indexLocationDirectory, dbHost, dbPort, dbPassword, dbName); } // TODO: I'm assuming that there is always an attribute @id in each element public void indexIncrementalFromXmlDB(String indexLocationDirectory, String dbHost, int dbPort, String dbPassword, String dbName) throws Exception { // I'm upgrading so the baseline is the current nodes number int countNodes = getCountDocuments(); String driverXml = ""; String connectionString = ""; Collection coll; IndexWriter w = null; Map<String, XPathExpression> fieldXpe = new HashMap<String, XPathExpression>(); logger.info("indexIncrementalFromXmlDB(generic) is starting for [{}], and initially I have[{}] ... ", new Object[] { indexId, countNodes }); try { Directory indexTempDirectory = FSDirectory.open(new File(indexLocationDirectory, indexId)); w = openIndex(indexTempDirectory, indexAnalyzer); HierarchicalConfiguration connsConf = (HierarchicalConfiguration) Application.getInstance() .getPreferences().getConfSubset("bs.xmldatabase"); if (null != connsConf) { driverXml = connsConf.getString("driver"); connectionString = connsConf.getString("base") + "://" + dbHost + ":" + dbPort + "/" + dbName; } else { logger.error("bs.xmldatabase Configuration is missing!!"); } logger.debug("connectionString->" + connectionString); coll = DatabaseManager.getCollection(connectionString); XPathQueryService service = (XPathQueryService) coll.getService("XPathQueryService", "1.0"); DocumentInfo source = null; Configuration config = ((SaxonEngine) Application.getAppComponent("SaxonEngine")).trFactory .getConfiguration(); XPath xp = new XPathEvaluator(config); for (FieldInfo field : fields.values()) { fieldXpe.put(field.name, xp.compile(field.path)); logger.debug("Field Path->[{}]", field.path); } // the xmldatabase is not very correct and have memory problem for // queires with huge results, so its necessary to implement our own // iteration mechanism // // // I will collect all the results // ResourceSet set = service.query(this.env.indexDocumentPath); long numberResults = 0; ResourceSet set = service.query("count(" + indexDocumentPath + ")"); if (set.getIterator().hasMoreResources()) { numberResults = Integer.parseInt((String) set.getIterator().nextResource().getContent()); } // TODO:######################################Change this after - // this is just a performance test // float percentage=0.1F; // numberResults=Math.round(numberResults * percentage); logger.debug("Number of results->" + numberResults); long pageSizeDefault = 50000; if (numberResults > 1000000) { pageSizeDefault = 1000000; } long pageNumber = 1; int count = 0; Map<String, AttsInfo[]> cacheAtt = new HashMap<String, AttsInfo[]>(); Map<String, XPathExpression> cacheXpathAtt = new HashMap<String, XPathExpression>(); Map<String, XPathExpression> cacheXpathAttValue = new HashMap<String, XPathExpression>(); while ((pageNumber * pageSizeDefault) <= (numberResults + pageSizeDefault - 1)) { // calculate the last hit long pageInit = (pageNumber - 1) * pageSizeDefault + 1; long pageSize = (pageNumber * pageSizeDefault < numberResults) ? pageSizeDefault : (numberResults - pageInit + 1); service = (XPathQueryService) coll.getService("XPathQueryService", "1.0"); // xquery paging using subsequence function long time = System.nanoTime(); // TODO: I'm assuming that there is always an attribute @id in // each element set = service.query("for $x in(subsequence(" + indexDocumentPath + "/@id," + pageInit + "," + pageSize + ")) return string($x)"); double ms = (System.nanoTime() - time) / 1000000d; logger.info("Query XMLDB took ->[{}]", ms); ResourceIterator iter = set.getIterator(); XPath xp2; XPathExpression xpe2; List documentNodes; StringReader reader; // cache of distinct attributes fora each sample group while (iter.hasMoreResources()) { count++; logger.debug("its beeing processed the number ->" + count); String idToProcess = (String) iter.nextResource().getContent(); logger.debug("@id that is being processed->" + idToProcess); // I need to get the sample ResourceSet setid = service.query(indexDocumentPath + "[@id='" + idToProcess + "']"); ResourceIterator iterid = setid.getIterator(); while (iterid.hasMoreResources()) { StringBuilder xml = new StringBuilder(); xml.append((String) iterid.nextResource().getContent()); // logger.debug(xml.toString()); reader = new StringReader(xml.toString()); source = config.buildDocument(new StreamSource(reader)); // logger.debug("XML DB->[{}]", // PrintUtils.printNodeInfo((NodeInfo) source, config)); Document d = new Document(); xp2 = new XPathEvaluator(source.getConfiguration()); int position = indexDocumentPath.lastIndexOf("/"); // TODO: I also need to change this String pathRoot = ""; if (position != -1) { pathRoot = indexDocumentPath.substring(position); } else { pathRoot = indexDocumentPath; } // logger.debug("PathRoot->[{}]",pathRoot); xpe2 = xp2.compile(pathRoot); documentNodes = (List) xpe2.evaluate(source, XPathConstants.NODESET); for (Object node : documentNodes) { // logger.debug("XML DB->[{}]",PrintUtils.printNodeInfo((NodeInfo)node,config)); String idElement = (String) fieldXpe.get("id").evaluate(node, XPathConstants.STRING); // I need to see if it already exists // I will also add this document if it is nor marked // as "todelete" Boolean toDelete = (Boolean) fieldXpe.get("delete").evaluate(node, XPathConstants.BOOLEAN); // TODO:######################################Change // this after - this is just a performance test int deletePercentage = 10; toDelete = (count % deletePercentage) == 0 ? true : false; logger.debug( "Incremental Update - The document [{}] is being processed and is marked to delete?[{}]", new Object[] { idElement, toDelete }); // I will always try to delete the document (i don't // know if it is new or if it was changed) Term idTerm = new Term("id", idElement.toLowerCase()); int countToDelete = getIndexReader().docFreq(idTerm); if (countToDelete > 0) { // if has more than one, I have to send an email // to warn if (countToDelete > 1) { Application.getInstance().sendEmail(null, null, "BIOSAMPLES ERROR - Incremental Update - Removing more than one document! id-> " + idElement, " documents found:" + countToDelete); // I will launch an exception throw new Exception( "BIOSAMPLES ERROR - Incremental Update - Removing more than one document in incremental update id-> " + idElement + " documents found:" + countToDelete); } logger.debug("The document with id [{}] is being deleted from Lucene", idElement); w.deleteDocuments(idTerm); // need to remove one from the number of // documents count countNodes--; } // the element doesn't exist on GUI else { // if it is marked to delete I will just an // warning email - it's possible that it was // inserted and deleted on the Backend but it // had never been sent to the GUI before if (toDelete) { Application.getInstance().sendEmail(null, null, "BIOSAMPLES WARNING - Incremental Update - Id marked for deletion but the id doesn't exist on the GUI! id-> " + idElement, ""); } } // if (toDelete) { // logger.debug( // "The document with id [{}] was marked to deletion so I will not process it", // idElement); // } else { // I just process it is it is not for deletion) if (!toDelete) { try { d = processEntryIndex(node, config, service, fieldXpe); } catch (Exception x) { String xmlError = PrintUtils.printNodeInfo((NodeInfo) node, config); logger.error("XML that was being processed when the error occurred DB->[{}]", xmlError); // to avoid the next running to stop // because its not able to delete the // newSetup directory w.close(); throw new Exception("Xml that is being processed:" + xmlError, x); } countNodes++; addIndexDocument(w, d); } } // } documentNodes = null; source = null; reader = null; xml = null; // logger.debug("count->[{}]", countNodes); } } logger.debug("until now it were processed->[{}]", pageNumber * pageSizeDefault); pageNumber++; // if (coll != null) { // try { // // coll.close(); // } catch (Exception e) { // // TODO Auto-generated catch block // e.printStackTrace(); // } // } set = null; } setCountDocuments(countNodes); // add metadata to the lucene index Map<String, String> map = new HashMap<String, String>(); map.put("numberDocs", Integer.toString(countNodes)); map.put("date", Long.toString(System.nanoTime())); // logger.debug(Application.getInstance().getComponent("XmlDbConnectionPool").getMetaDataInformation()); // I cannot call directly // getComponent("XmlDbConnectionPool").getMetaDataInformation(), // because I can be working in a did String dbInfo = ((XmlDbConnectionPool) Application.getInstance().getComponent("XmlDbConnectionPool")) .getDBInfo(dbHost, dbPort, dbPassword, dbName); // TODO: I need to put here what I have before - to track all the // changes (old numberDocs + old date + oldDBInfo) map.put("DBInfo", dbInfo + "<BR>##################################################<BR>" + getMetadataInformation()); commitIndex(w, map); } catch (Exception x) { logger.error("Caught an exception:", x); w.close(); throw x; } } IndexWriter createIndex(Directory indexDirectory, Analyzer analyzer) throws Exception { IndexWriter iwriter = null; try { iwriter = new IndexWriter(indexDirectory, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED); // TODO: just to check if it solves the slowly indexing indexes with // more iwriter.setMaxBufferedDocs(500000); } catch (Exception x) { logger.error("Caught an exception:", x); throw x; } return iwriter; } IndexWriter openIndex(Directory indexDirectory, Analyzer analyzer) { IndexWriter iwriter = null; try { iwriter = new IndexWriter(indexDirectory, analyzer, false, IndexWriter.MaxFieldLength.UNLIMITED); // TODO: just to check if it solves the slowly indexing indexes with // more iwriter.setMaxBufferedDocs(500000); } catch (Exception x) { logger.error("Caught an exception:", x); } return iwriter; } void addIndexField(Document document, String name, Object value, boolean shouldAnalyze, boolean shouldStore, boolean sort) { String stringValue; if (value instanceof String) { stringValue = (String) value; } else if (value instanceof NodeInfo) { stringValue = ((NodeInfo) value).getStringValue(); } else { stringValue = value.toString(); logger.warn( "Not sure if I handle string value of [{}] for the field [{}] correctly, relying on Object.toString()", value.getClass().getName(), name); } // TODO // logger.debug("value->[{}]", stringValue); document.add(new Field(name, stringValue, shouldStore ? Field.Store.YES : Field.Store.NO, shouldAnalyze ? Field.Index.ANALYZED : Field.Index.NOT_ANALYZED)); // ig Im indexing a String and the @sort=true I will always create a new // field (fieldname+sort) if (sort) { String newF = name + "sort"; document.add(new Field(newF, stringValue, Field.Store.NO, Field.Index.NOT_ANALYZED)); } } void addBooleanIndexField(Document document, String name, Object value, boolean sort) { Boolean boolValue = null; if (value instanceof Boolean) { boolValue = (Boolean) value; } else if (null != value) { String stringValue = value.toString(); boolValue = StringTools.stringToBoolean(stringValue); logger.warn( "Not sure if I handle string value [{}] for the field [{}] correctly, relying on Object.toString()", stringValue, name); } // TODO // logger.debug("value->[{}]", boolValue.toString()); if (!sort) { document.add(new Field(name, null == boolValue ? "" : boolValue.toString(), Field.Store.NO, Field.Index.NOT_ANALYZED)); } else { document.add(new Field(name, null == boolValue ? "" : boolValue.toString(), Field.Store.YES, Field.Index.NOT_ANALYZED)); } } void addIntIndexField(Document document, String name, Object value, boolean store, boolean sort) { Long longValue = null; if (value instanceof BigInteger) { longValue = ((BigInteger) value).longValue(); } else if (value instanceof NodeInfo) { longValue = Long.parseLong(((NodeInfo) value).getStringValue()); } else { logger.warn( "Not sure if I handle long value of [{}] for the field [{}] correctly, relying on Object.toString()", value.getClass().getName(), name); } // TODO // logger.debug( "field [{}] value->[{}]", name, longValue.toString()); // logger.debug( "field [{}] store->[{}]", name, store); // logger.debug( "field [{}] sort->[{}]", name, sort); if (null != longValue) { // its more clear to divide the if statement in 3 parts if (sort) { // It has to be int because of sorting (otherwise the error: // Invalid shift value in prefixCoded string (is encoded value // really an INT?)) document.add(new NumericField(name, Field.Store.YES, true).setLongValue(longValue)); } else { if (!store) { document.add(new NumericField(name).setLongValue(longValue)); } else { document.add(new NumericField(name, Field.Store.YES, true).setLongValue(longValue)); } } } else { logger.warn("Long value of the field [{}] was null", name); } } void addIndexDocument(IndexWriter iwriter, Document document) throws Exception { try { iwriter.addDocument(document); } catch (Exception x) { logger.error("Caught an exception:", x); throw x; } } void commitIndex(IndexWriter iwriter) { try { iwriter.optimize(); iwriter.commit(); iwriter.close(); } catch (Exception x) { logger.error("Caught an exception:", x); } } void commitIndex(IndexWriter iwriter, Map<String, String> map) { try { iwriter.optimize(); iwriter.commit(map); iwriter.close(); } catch (Exception x) { logger.error("Caught an exception:", x); } } // process each document (this has all the logic related with dynamic // attributes) public Document processEntryIndex(Object node, Configuration config, XPathQueryService service, Map<String, XPathExpression> fieldXpe) throws Exception { Document luceneDoc = new Document(); XPath xp = new XPathEvaluator(config); for (FieldInfo field : fields.values()) { try { if (!field.process) { List values = (List) fieldXpe.get(field.name).evaluate(node, XPathConstants.NODESET); for (Object v : values) { if ("integer".equals(field.type)) { addIntIndexField(luceneDoc, field.name, v, field.shouldStore, field.shouldSort); } else if ("date".equals(field.type)) { // todo: // addDateIndexField(d, // field.name, // v); logger.error("Date fields are not supported yet, field [{}] will not be created", field.name); } else if ("boolean".equals(field.type)) { addBooleanIndexField(luceneDoc, field.name, v, field.shouldSort); } else { addIndexField(luceneDoc, field.name, v, field.shouldAnalyze, field.shouldStore, field.shouldSort); } } } else { if (field.name.equalsIgnoreCase("attributes")) { // implement here the biosamples // database sample attributes // logic // TODO: rpe // logger.debug("There is A special treatment for this field->" // + field.name); List values = (List) fieldXpe.get(field.name).evaluate(node, XPathConstants.NODESET); for (Iterator iterator = values.iterator(); iterator.hasNext();) { Object object = (Object) iterator.next(); // logger.debug("attributes->" + object); String valClass = (String) fieldXpe.get("attributeName").evaluate(object, XPathConstants.STRING); //TODO: document this on trac and on website documentations help valClass = valClass.replace(" ", "_").toLowerCase(); //valClass=valClass.toLowerCase(); String valType = (String) fieldXpe.get("attributeType").evaluate(object, XPathConstants.STRING); String valValue = (String) fieldXpe.get("attributeValue").evaluate(object, XPathConstants.STRING); if (!valType.equalsIgnoreCase("integer") && !valType.equalsIgnoreCase("real")) { //TODO: change this value valValue = valValue.substring(0, Math.min(valValue.length(), 25)); addIndexField(luceneDoc, "attributes", "=" + valClass + "= " + valValue, true, false, true); } else { valValue = valValue.trim(); int val = 0; if (valValue == null || valValue.equalsIgnoreCase("") || valValue.equalsIgnoreCase("NaN")) { valValue = "0"; } BigDecimal num = new BigDecimal(valValue); num = num.multiply(new BigDecimal(100)); int taux = num.toBigInteger().intValue(); valValue = String.format("%07d", taux); //I need to mantain the spaces for lucene consider different words addIndexField(luceneDoc, "attributes", "=" + valClass + "= " + valValue, true, false, true); } // logger.debug("@class->" + valClass); // logger.debug("@type->" + valType); // logger.debug("text->" + valValue); } } else { // logger.debug("There is NO special treatment for this field->" // + field.name); } } } catch (XPathExpressionException x) { String xmlError = PrintUtils.printNodeInfo((NodeInfo) node, config); logger.error("Field being processed->[{}]", field.name); xmlError = "##FIELD BEING PROCESSED##->" + field.name + "\n" + xmlError; logger.error("XPathExpressionException->[{}]", x.getMessage()); logger.error("Caught an exception while indexing expression [" + field.path + "] for document [" + ((NodeInfo) node).getStringValue().substring(0, 20) + "...]", x); throw new Exception("XPathExpressionException Xml:" + xmlError, x); } catch (Exception xe) { String xmlError = PrintUtils.printNodeInfo((NodeInfo) node, config); logger.error("Generic Exception->[{}]", xe.getMessage()); throw new Exception("Generic Exception Xml:" + xmlError, xe); } } return luceneDoc; } // TODO: I'm assuming that there is always an attribute @id in each element public void indexFromXmlDB_FACETS(String indexLocationDirectory, String dbHost, int dbPort, String dbPassword, String dbName) throws Exception { int countNodes = 0; String driverXml = ""; String connectionString = ""; Collection coll; IndexWriter w = null; Map<String, XPathExpression> fieldXpe = new HashMap<String, XPathExpression>(); try { Directory indexTempDirectory = FSDirectory.open(new File(indexLocationDirectory, indexId)); w = createIndex(indexTempDirectory, indexAnalyzer); Directory taxDir = FSDirectory.open(new File(indexLocationDirectory + "Facets", indexId)); DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxDir); CategoryDocumentBuilder docBuilder = new CategoryDocumentBuilder(taxoWriter); HierarchicalConfiguration connsConf = (HierarchicalConfiguration) Application.getInstance() .getPreferences().getConfSubset("bs.xmldatabase"); if (null != connsConf) { // TODO: rpe use the component XmlDatabasePooling driverXml = connsConf.getString("driver"); // I will use the connectionString that was passed by parameter // (in several parameters) connectionString = connsConf.getString("base") + "://" + dbHost + ":" + dbPort + "/" + dbName; } else { logger.error("bs.xmldatabase Configuration is missing!!"); } // I cannot register this database again (this is already registered // on XmlDbConnectionPool Component - // java.nio.channels.OverlappingFileLockException // c = Class.forName(driverXml); // db = (Database) c.newInstance(); // DatabaseManager.registerDatabase(db); logger.debug("connectionString->" + connectionString); coll = DatabaseManager.getCollection(connectionString); XPathQueryService service = (XPathQueryService) coll.getService("XPathQueryService", "1.0"); DocumentInfo source = null; // Loop through all result items // collect all the fields data Configuration config = ((SaxonEngine) Application.getAppComponent("SaxonEngine")).trFactory .getConfiguration(); XPath xp = new XPathEvaluator(config); // XPathExpression xpe = xp.compile(this.env.indexDocumentPath); for (FieldInfo field : fields.values()) { fieldXpe.put(field.name, xp.compile(field.path)); logger.debug("Field Path->[{}]", field.path); } // the xmldatabase is not very correct and have memory problem for // queires with huge results, so its necessary to implement our own // iteration mechanism // // // I will collect all the results // ResourceSet set = service.query(this.env.indexDocumentPath); // //TODO rpe // //ResourceSet set = service.query("//Sample"); // logger.debug("Number of results->" + set.getSize()); // long numberResults = set.getSize(); long numberResults = 0; ResourceSet set = service.query("count(" + indexDocumentPath + ")"); if (set.getIterator().hasMoreResources()) { numberResults = Integer.parseInt((String) set.getIterator().nextResource().getContent()); } logger.debug("Number of results->" + numberResults); long pageSizeDefault = 50000; // the samplegroup cannot be big otherwise I will obtain a memory // error ... but the sample must b at least one million because the // paging queries are really slow - we need to balance it // (for samples 1million, for samplegroup 50000) if (numberResults > 1000000) { pageSizeDefault = 1000000; } long pageNumber = 1; int count = 0; Map<String, AttsInfo[]> cacheAtt = new HashMap<String, AttsInfo[]>(); Map<String, XPathExpression> cacheXpathAtt = new HashMap<String, XPathExpression>(); Map<String, XPathExpression> cacheXpathAttValue = new HashMap<String, XPathExpression>(); while ((pageNumber * pageSizeDefault) <= (numberResults + pageSizeDefault - 1)) { // while ((pageNumber<=1)) { // calculate the last hit long pageInit = (pageNumber - 1) * pageSizeDefault + 1; long pageSize = (pageNumber * pageSizeDefault < numberResults) ? pageSizeDefault : (numberResults - pageInit + 1); service = (XPathQueryService) coll.getService("XPathQueryService", "1.0"); // xquery paging using subsequence function long time = System.nanoTime(); // /set = // service.query("for $x in(/Biosamples/SampleGroup/Sample/@id) return string($x)"); set = service.query("for $x in(subsequence(" + indexDocumentPath + "/@id," + pageInit + "," + pageSize + ")) return string($x)"); // logger.debug("Number of results of page->" + set.getSize()); double ms = (System.nanoTime() - time) / 1000000d; logger.info("Query XMLDB took ->[{}]", ms); ResourceIterator iter = set.getIterator(); XPath xp2; XPathExpression xpe2; List documentNodes; StringReader reader; // cache of distinct attributes fora each sample group while (iter.hasMoreResources()) { count++; logger.debug("its beeing processed the number ->" + count); String idSample = (String) iter.nextResource().getContent(); logger.debug("idSample->" + idSample); // I need to get the sample ResourceSet setid = service.query(indexDocumentPath + "[@id='" + idSample + "']"); // System.out.println("/Biosamples/SampleGroup/Sample[@id='" // + idSample + "']"); ResourceIterator iterid = setid.getIterator(); List<CategoryPath> sampleCategories = null; while (iterid.hasMoreResources()) { // System.out.println(""); // /xml=(String) iterid.nextResource().getContent(); // /xml=(String) iter.nextResource().getContent(); // logger.debug("xml->"+xml); // /reader = new StringReader(xml); StringBuilder xml = new StringBuilder(); xml.append((String) iterid.nextResource().getContent()); // logger.debug(xml.toString()); reader = new StringReader(xml.toString()); source = config.buildDocument(new StreamSource(reader)); // logger.debug("XML DB->[{}]", // PrintUtils.printNodeInfo((NodeInfo) source, config)); Document d = new Document(); xp2 = new XPathEvaluator(source.getConfiguration()); int position = indexDocumentPath.lastIndexOf("/"); ; String pathRoot = ""; if (position != -1) { pathRoot = indexDocumentPath.substring(position); } else { pathRoot = indexDocumentPath; } // logger.debug("PathRoot->[{}]",pathRoot); xpe2 = xp2.compile(pathRoot); // TODO rpe // xpe2 = xp2.compile("/Sample"); documentNodes = (List) xpe2.evaluate(source, XPathConstants.NODESET); for (Object node : documentNodes) { // logger.debug("XML DB->[{}]",PrintUtils.printNodeInfo((NodeInfo)node,config)); for (FieldInfo field : fields.values()) { try { // Configuration // config=doc.getConfiguration(); // I Just have to calculate the Xpath if (!field.process) { List values = (List) fieldXpe.get(field.name).evaluate(node, XPathConstants.NODESET); // logger.debug("Field->[{}] values-> [{}]", // field.name, // values.toString()); for (Object v : values) { if ("integer".equals(field.type)) { addIntIndexField(d, field.name, v, field.shouldStore, field.shouldSort); // Just to test I will put here // one facet for the samples if (field.name.equalsIgnoreCase("samples")) { System.out.println("Value-->" + v.toString()); sampleCategories = new ArrayList<CategoryPath>(); sampleCategories.add(new CategoryPath("samples", v.toString())); } } else if ("date".equals(field.type)) { // todo: addDateIndexField(d, // field.name, // v); logger.error( "Date fields are not supported yet, field [{}] will not be created", field.name); } else if ("boolean".equals(field.type)) { addBooleanIndexField(d, field.name, v, field.shouldSort); } else { addIndexField(d, field.name, v, field.shouldAnalyze, field.shouldStore, field.shouldSort); } } } else { if (field.name.equalsIgnoreCase("attributes")) { // implement here the biosamples // database sample attributes logic // TODO: rpe // logger.debug("There is A special treatment for this field->" // + field.name); List values = (List) fieldXpe.get(field.name).evaluate(node, XPathConstants.NODESET); // XPathExpression // classAtt=xp.compile("@class"); // XPathExpression // typeAtt=xp.compile("@dataType"); // XPathExpression // valueAtt=xp.compile("value"); String groupId = (String) fieldXpe.get("samplegroup").evaluate(node, XPathConstants.STRING); String id = (String) fieldXpe.get("accession").evaluate(node, XPathConstants.STRING); // logger.debug(groupId+"$$$" + id); // logger.debug("Field->[{}] values-> [{}]", // field.name, // values.toString()); AttsInfo[] attsInfo = null; if (cacheAtt.containsKey(groupId)) { attsInfo = cacheAtt.get(groupId); } else { logger.debug("No exists cache for samplegroup->" + groupId); // ResourceSet setAtt = // service.query("distinct-values(/Biosamples/SampleGroup[@id='" // + groupId + // "']/Sample/attribute[@dataType!='INTEGER']/replace(@class,' ', '-'))"); // /ResourceSet setAtt = // service.query("distinct-values(/Biosamples/SampleGroup[@id='" // + groupId + // "']/Sample/attribute/replace(@class,' ', '-'))"); // /ResourceSet setAtt = // service.query("distinct-values(/Biosamples/SampleGroup[@id='" // + groupId + // "']/Sample/attribute/@class)"); ResourceSet setAtt = service .query("data(/Biosamples/SampleGroup[@id='" + groupId + "']/SampleAttributes/attribute/@class)"); // logger.debug("->" // + // "/Biosamples/SampleGroup[@id='" // + groupId + // "']/SampleAttributes/attribute/@class"); ResourceIterator resAtt = setAtt.getIterator(); int i = 0; attsInfo = new AttsInfo[(int) setAtt.getSize()]; while (resAtt.hasMoreResources()) { String classValue = (String) resAtt.nextResource().getContent(); // logger.debug("->" // + classValue); // need to use this because // of the use of quotes in // the name of the classes String classValueWitoutQuotes = classValue.replaceAll("\"", "\"\""); // logger.debug("Class value->" // + classValue); XPathExpression xpathAtt = null; XPathExpression xpathAttValue = null; if (cacheXpathAtt.containsKey(classValue)) { xpathAtt = cacheXpathAtt.get(classValue); xpathAttValue = cacheXpathAttValue.get(classValue); } else { xpathAtt = xp.compile("./attribute[@class=\"" + classValueWitoutQuotes + "\"]/@dataType"); xpathAttValue = xp.compile( "attribute[@class=\"" + classValueWitoutQuotes + "\"]/value/text()[last()]"); // logger.debug("attribute[@class=\"" // + // classValueWitoutQuotes // + // "\"]//value/text()"); // //xpathAttValue=xp.compile("./attribute[@class=\"" // + // classValueWitoutQuotes // + // "\"]/value[1]/text()"); // logger.debug("./attribute[@class=\"" // + // classValueWitoutQuotes // + // "\"]/value[1]/text()"); cacheXpathAtt.put(classValue, xpathAtt); cacheXpathAttValue.put(classValue, xpathAttValue); } // this doesnt work when the // first sample of sample // group doens have all the // attributes // im using \" becuse there // are some attributes thas // has ' on the name!!! // /ResourceSet setAttType = // service.query("string((/Biosamples/SampleGroup[@id='" // + groupId // +"']/Sample/attribute[@class=replace(\"" // + classValueWitoutQuotes // + // "\",'-',' ')]/@dataType)[1])"); // /ResourceSet setAttType = // service.query("string(/Biosamples/SampleGroup[@id='" // + groupId // +"']/Sample/attribute[@class=\"" // + classValueWitoutQuotes // + "\"]/@dataType)"); ResourceSet setAttType = service .query("data(/Biosamples/SampleGroup[@id='" + groupId + "']/SampleAttributes/attribute[@class=\"" + classValueWitoutQuotes + "\"]/@dataType)"); String dataValue = (String) setAttType.getIterator() .nextResource().getContent(); // logger.debug("Data Type of " // + classValue + " ->" + // dataValue); // String // dataValue=(String)xpathAtt.evaluate(node, // XPathConstants.STRING); AttsInfo attsI = new AttsInfo(classValue, dataValue); // logger.debug("Atttribute->class" // + attsI.name + "->type->" // + attsI.type + "->i" + // i); attsInfo[i] = attsI; // logger.debug("distinct att->" // + value); // cacheAtt.put(groupId, // value); i++; } cacheAtt.put(groupId, attsInfo); // distinctAtt=cacheAtt.get(groupId); // logger.debug("Already exists->" // + distinctAtt); } int len = attsInfo.length; for (int i = 0; i < len; i++) { // logger.debug("$$$$$$->" + // attsInfo[i].name + "$$$$" + // attsInfo[i].type); if (!attsInfo[i].type.equalsIgnoreCase("integer") && !attsInfo[i].type.equalsIgnoreCase("real")) { XPathExpression valPath = cacheXpathAttValue .get(attsInfo[i].name); String val = (String) valPath.evaluate(node, XPathConstants.STRING); // logger.debug("$$$$$$->" + // "STRING->" + val + ""); addIndexField(d, (i + 1) + "", val, true, false, true); } else { XPathExpression valPath = cacheXpathAttValue .get(attsInfo[i].name); String valS = (String) valPath.evaluate(node, XPathConstants.STRING); valS = valS.trim(); // logger.debug("Integer->" // + valS); int val = 0; if (valS == null || valS.equalsIgnoreCase("") || valS.equalsIgnoreCase("NaN")) { valS = "0"; } // sort numbers as strings // logger.debug("class->" + // attsInfo[i].name // +"value->##"+ valS + // "##"); BigDecimal num = new BigDecimal(valS); num = num.multiply(new BigDecimal(100)); int taux = num.toBigInteger().intValue(); valS = String.format("%07d", taux); // logger.debug("Integer->" // + valS + "position->" // +(i+1)+"integer"); addIndexField(d, (i + 1) + "", valS, true, false, true); // addIntIndexField(d, // (i+1)+"integer", new // BigInteger(valS),false, // true); // } } } else { // logger.debug("There is NO special treatment for this field->" // + field.name); } } } catch (XPathExpressionException x) { String xmlError = PrintUtils.printNodeInfo((NodeInfo) node, config); logger.error("XML DB->[{}]", xmlError); logger.error("Caught an exception while indexing expression [" + field.path + "] for document [" + ((NodeInfo) source).getStringValue().substring(0, 20) + "...]", x); throw new Exception("Xml:" + xmlError, x); } } } documentNodes = null; source = null; reader = null; xml = null; countNodes++; // logger.debug("count->[{}]", countNodes); // facet tests docBuilder.setCategoryPaths(sampleCategories); docBuilder.build(d); addIndexDocument(w, d); } } logger.debug("until now it were processed->[{}]", pageNumber * pageSizeDefault); pageNumber++; if (coll != null) { try { // coll.close(); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } set = null; } setCountDocuments(countNodes); // add metadata to the lucene index Map<String, String> map = new HashMap<String, String>(); map.put("numberDocs", Integer.toString(countNodes)); map.put("date", Long.toString(System.nanoTime())); // logger.debug(Application.getInstance().getComponent("XmlDbConnectionPool").getMetaDataInformation()); // I cannot call directly // getComponent("XmlDbConnectionPool").getMetaDataInformation(), // because I can be working in a did String dbInfo = ((XmlDbConnectionPool) Application.getInstance().getComponent("XmlDbConnectionPool")) .getDBInfo(dbHost, dbPort, dbPassword, dbName); map.put("DBInfo", dbInfo); // facet taxoWriter.commit(); taxoWriter.close(); commitIndex(w, map); } catch (Exception x) { logger.error("Caught an exception:", x); w.close(); throw x; } } }