List of usage examples for org.dom4j Element addElement
Element addElement(String name);
Adds a new Element
node with the given name to this branch and returns a reference to the new node. From source file:com.fivepebbles.ProcessFiles.java
License:MIT License
@Override public void setFiles(String[][] valuesin) { //Create a XML file to hold AWS bucket names and related files/folders Document mydoc = DocumentHelper.createDocument(); Element root = mydoc.addElement("s3backup"); Element bucket1 = null;/*from w w w . j a v a 2 s. com*/ for (int a1 = 0; a1 < valuesin.length; a1++) { for (int b1 = 0; b1 < valuesin[a1].length; b1++) { //Bucket names are in index 0 if (b1 == 0) { bucket1 = root.addElement("bucket").addAttribute("name", valuesin[a1][b1]); } else { if (bucket1 != null & valuesin[a1][b1] != null) { bucket1.addElement("file").addText(valuesin[a1][b1]); } } } bucket1 = null; } //Save files and folders in file system FileWriter fWriter2; try { fWriter2 = new FileWriter(new File("target", "s3files.xml")); mydoc.write(fWriter2); fWriter2.flush(); fWriter2.close(); } catch (IOException e) { //***TODO*** log message e.printStackTrace(); } }
From source file:com.flaptor.hounder.crawler.modules.IndexerModule.java
License:Apache License
private void deleteFromIndex(Page page) { org.dom4j.Document dom = DocumentHelper.createDocument(); Element root = dom.addElement("documentDelete"); root.addElement("documentId").addText(getDocumentId(page)); try {//w ww .jav a 2 s .c o m int i = pageMapper.mapPage(page); while (indexers[i].index(dom) == IndexerReturnCode.RETRY_QUEUE_FULL) { try { Thread.sleep(indexerBusyRetryTime * 1000); } catch (InterruptedException e) { logger.debug("Sleep interrupted: " + e, e); } } page.setEmitted(false); } catch (Exception e) { logger.error(e, e); } }
From source file:com.flaptor.hounder.crawler.modules.IndexerModule.java
License:Apache License
@SuppressWarnings("unchecked") protected void addToIndex(FetchDocument doc) { byte[] content = doc.getContent(); if (0 == content.length) { logger.warn("Page has no data. Ignoring this document."); return;//from w ww . j a va 2 s . c o m } Set<String> categories = doc.getCategories(); Map<String, Object> attributes = doc.getAttributes(); Map<String, Object> indexableAttributes = doc.getIndexableAttributes(); // build xml doc org.dom4j.Document dom = DocumentHelper.createDocument(); Element root = dom.addElement("documentAdd"); Page page = doc.getPage(); String text = doc.getText(); String url = page.getUrl(); String host = getHost(url); String title = doc.getTitle(titleLengthLimit); String tokenizedHost = tokenizeHost(host); String anchorText = getAnchorText(page); float categoryBoost = calculateCategoryBoost(attributes); float pagerankBoost = calculatePagerankBoost(page); float spamrankBoost = calculateSpamrankBoost(page); float logBoost = calculateLogBoost(page); float freshnessBoost = calculateFreshnessBoost(page); // add overall score float f1 = factor("category", categoryBoost, categoryBoostDamp); float f2 = factor("pagerank", pagerankBoost, pagerankBoostDamp); float f3 = factor("spamrank", spamrankBoost, spamrankBoostDamp); float f4 = factor("log", logBoost, logBoostDamp); float f5 = factor("freshness", freshnessBoost, freshnessBoostDamp); float f6 = ((Double) attributes.get("boost")).floatValue(); // as calculated by the boost module, or 1.0 if no boost module is defined. float boost = f1 * f2 * f3 * f4 * f5 * f6; // System.out.println("BOOST url=["+url+"] category="+f1+" ("+categoryBoost+":"+categoryBoostDamp+") pagerank="+f2+" ("+pagerankBoost+":"+pagerankBoostDamp+") log="+f3+" ("+logBoost+":"+logBoostDamp+") freshness="+f4+" ("+freshnessBoost+":"+freshnessBoostDamp+") moduleBoost="+f5+" Boost="+boost); if (boost < 1e-6f) { logger.warn("Boost too low! 
(" + boost + ") category=" + f1 + " (" + categoryBoost + ":" + categoryBoostDamp + ") pagerank=" + f2 + " (" + pagerankBoost + ":" + pagerankBoostDamp + ") spamrank=" + f3 + " (" + spamrankBoost + ":" + spamrankBoostDamp + ") log=" + f4 + " (" + logBoost + ":" + logBoostDamp + ") freshness=" + f5 + " (" + freshnessBoost + ":" + freshnessBoostDamp + ") moduleBoost=" + f6); boost = 1e-6f; } if (null == title || "".equals(title)) { title = "Untitled"; } root.addElement("boost").addText(String.valueOf(boost)); root.addElement("documentId").addText(getDocumentId(page)); Map<String, Double> boostMap = (Map<String, Double>) attributes.get("field_boost"); // add the search fields addField(root, "url", url, true, true, true, boostMap); addField(root, "site", host, true, true, false, boostMap); addField(root, "tokenizedHost", tokenizedHost, false, true, true, boostMap); addField(root, "title", title, true, true, true, boostMap); addField(root, "text", text, true, true, true, boostMap); addField(root, "anchor", anchorText, false, true, true, boostMap); addField(root, "crawl", crawlName, false, true, true, boostMap); if (sendContent) { addBody(root, doc, content); } // for debugging only //addField(root, "boostinfo", "category="+f1+" ("+categoryBoost+":"+categoryBoostDamp+") pagerank="+f2+" ("+pagerankBoost+":"+pagerankBoostDamp+") log="+f3+" ("+logBoost+":"+logBoostDamp+") freshness="+f4+" ("+freshnessBoost+":"+freshnessBoostDamp+") moduleBoost="+f5+" Boost="+boost, true, false, false, null); addAdditionalFields(root, page, boostMap); // Adding metainfo from attributes Set<Entry<String, Object>> attributeSet = indexableAttributes.entrySet(); for (Entry<String, Object> attribute : attributeSet) { addField(root, attribute.getKey(), attribute.getValue() == null ? 
"" : attribute.getValue().toString(), true, true, true, boostMap); } StringBuffer assignedCategories = new StringBuffer(); if (null != categories) { // iterate through the classes the page belongs to add each category and its score for (Iterator<String> iter = categories.iterator(); iter.hasNext();) { assignedCategories.append(iter.next()); assignedCategories.append(" "); // repeat the field times proportional to the score (this is a way to boost the document by category); //for (int rep = 0; rep < score*10; rep++) { // addField(root, "categoryBoost", categ, false, true, false); //} } addField(root, "categories", assignedCategories.toString().trim(), true, true, true, boostMap); } if (logger.isDebugEnabled()) { logger.debug("Indexing dom: " + DomUtil.domToString(dom)); } // Send the document to the indexer. If the queue is full, wait and retry. try { int i = pageMapper.mapPage(page); while (indexers[i].index(dom) == IndexerReturnCode.RETRY_QUEUE_FULL) { try { Thread.sleep(indexerBusyRetryTime * 1000); } catch (InterruptedException e) { logger.debug("Sleep interrupted: " + e, e); } } page.setEmitted(true); } catch (Exception e) { logger.error(e, e); } }
From source file:com.flaptor.hounder.crawler.modules.IndexerModule.java
License:Apache License
/**
 * Adds a new field to the <code>doc</code> Element.
 *
 * <p>The field is written as a "field" child element carrying the attributes "name", "stored",
 * "indexed", "tokenized" and "boost", with <code>value</code> as its text. The boost is taken
 * from <code>boostMap</code>; when the map is null, has no entry for <code>name</code>, or maps
 * it to null, the neutral boost 1.0 is used.
 *
 * @param doc the element to add the field to
 * @param name the name of the field
 * @param value the String value for the field
 * @param stored true if and only if the field should be stored
 * @param indexed true if and only if the field should be indexed
 * @param tokenized true if and only if the field should be tokenized
 * @param boostMap map containing the boosts for each field name, may be null
 */
protected final void addField(Element doc, String name, String value, boolean stored, boolean indexed,
        boolean tokenized, Map<String, Double> boostMap) {
    // Single lookup: a null mapping must not reach toString(), so it falls back to 1.0 as well.
    Double configuredBoost = (null != boostMap) ? boostMap.get(name) : null;
    double boost = (null != configuredBoost) ? configuredBoost.doubleValue() : 1.0d;

    doc.addElement("field")
            .addAttribute("name", name)
            .addAttribute("stored", Boolean.toString(stored))
            .addAttribute("indexed", Boolean.toString(indexed))
            .addAttribute("tokenized", Boolean.toString(tokenized))
            .addAttribute("boost", Double.toString(boost))
            .addText(value);
}
From source file:com.flaptor.hounder.crawler.modules.IndexerModule.java
License:Apache License
protected final void addBody(Element doc, FetchDocument fetchDoc, byte[] bytes) { String encoding = null;//from www .j av a 2 s. c o m // find charset. http headers usually have a Content-Type line, but // as it may not be in the same case, all headers are stored lowercased. // Content-Type lines contain mime-type and charset, separated by ; // for example // Content-Type: text/html; charset=UTF-8 if (fetchDoc.getHeader().containsKey("content-type")) { String[] tokens = fetchDoc.getHeader().get("content-type").split(";"); for (String token : tokens) { if (token.toLowerCase().contains("charset") && token.contains("=")) { encoding = token.split("=")[1].trim().toUpperCase(); break; } } } // if not found, use default encoding if (null == encoding) { encoding = java.nio.charset.Charset.defaultCharset().name(); } try { doc.addElement("body").addText(new String(bytes, encoding)); } catch (java.io.UnsupportedEncodingException e) { logger.error("while adding body: ", e); } }
From source file:com.flaptor.hounder.indexer.RmiIndexerStub.java
License:Apache License
/**
 * Builds a "documentDelete" request whose "documentId" element contains the given url.
 *
 * @param url the identifier of the document to delete
 * @return the request document
 */
private static Document generateDeleteDocument(String url) {
    org.dom4j.Document request = DocumentHelper.createDocument();
    Element deleteRoot = request.addElement("documentDelete");
    Element idElement = deleteRoot.addElement("documentId");
    idElement.addText(url);
    return request;
}
From source file:com.flaptor.hounder.indexer.SanitizerModule.java
License:Apache License
public static void main(String[] args) { String text = args[0];// w w w . ja v a2s . c o m Document doc = DocumentHelper.createDocument(); Element root = doc.addElement("documentAdd"); root.addElement("text").addText(text); root.addElement("field").addAttribute("name", "text").addAttribute("indexed", "true") .addAttribute("stored", "true").addAttribute("tokenized", "true").addText(text); SanitizerModule mod = new SanitizerModule(); Document[] docs = mod.internalProcess(doc); for (Document d : docs) { System.out.println(DomUtil.domToString(d)); } }
From source file:com.flaptor.hounder.searcher.OpenSearch.java
License:Apache License
/** * Creates a OpenSearch's compatible DOM document. * The generated dom contains only valid xml characters (infringing chars are removed). * Compliant with OpenSearch 1.0 with most of the Nutch 0.8.1 extensions. * @param baseUrl the url of the webapp//from w w w .jav a 2 s . c o m * @param htmlSearcher the name of the component (servlet/jsp) that returns the search results in an HTML page * @param opensearchSearcher the name of the component (servlet/jsp) that returns the search results in an OpenSearch RSS page * @param extraParams the parameters present in the request, not passed explicitly (such as sort, reverse, etc.) * @param queryString the query string, as entered by the user * @param start the offset of the first result * @param count the number of results requested (the actual number of results found may be smaller) * @param sr the SearchResults structure containing the result of performing the query * @return a DOM document * <br>An empty sr argument means that no results were found. */ public static final Document buildDom_1_0(String baseUrl, String htmlSearcher, String opensearchSearcher, String extraParams, String queryString, int start, int count, GroupedSearchResults sr, int status, String statusMessage, boolean useXslt) { String encodedQuery = null; try { encodedQuery = URLEncoder.encode(queryString, "UTF-8"); } catch (UnsupportedEncodingException e) { // Should never happen! 
encodedQuery = ""; } Document dom = DocumentHelper.createDocument(); if (useXslt) { Map<String, String> map = new HashMap<String, String>(); map.put("type", "text/xsl"); map.put("href", xsltPath); dom.addProcessingInstruction("xml-stylesheet", map); } Namespace opensearchNs = DocumentHelper.createNamespace("opensearch", XMLNS_A9_OPENSEARCH_1_0); Namespace hounderNs = DocumentHelper.createNamespace("hounder", XMLNS_HOUNDER_OPENSEARCH_1_0); Element root; Element channel; if (!useXslt) { root = dom.addElement("rss").addAttribute("version", "2.0"); channel = root.addElement("channel"); } else { channel = dom.addElement("searchResults"); root = channel; } root.add(opensearchNs); root.add(hounderNs); channel.addElement("title").addText(titlePrefix + " " + DomUtil.filterXml(queryString)); channel.addElement("link") .addText(baseUrl + "/" + htmlSearcher + "?query=" + encodedQuery + "&start=" + start + extraParams); channel.addElement("description").addText(descPrefix + " " + DomUtil.filterXml(queryString)); channel.addElement(QName.get("totalResults", opensearchNs)) .addText(Integer.toString(sr.totalGroupsEstimation())); channel.addElement(QName.get("startIndex", opensearchNs)).addText(Integer.toString(start)); channel.addElement(QName.get("itemsPerPage", opensearchNs)).addText(Integer.toString(count)); channel.addElement(QName.get("query", hounderNs)).addText(DomUtil.filterXml(queryString)); AQuery suggestedQuery = sr.getSuggestedQuery(); if (null != suggestedQuery) { channel.addElement(QName.get("suggestedQuery", hounderNs)) .addText(DomUtil.filterXml(suggestedQuery.toString())); } channel.addElement(QName.get("status", hounderNs)).addText(Integer.toString(status)); channel.addElement(QName.get("statusDesc", hounderNs)).addText(statusMessage); if (sr.lastDocumentOffset() > 0) { channel.addElement(QName.get("nextPage", hounderNs)).addText(baseUrl + "/" + opensearchSearcher + "?query=" + encodedQuery + "&start=" + (sr.lastDocumentOffset()) + extraParams); } for (int i = 0; 
i < sr.groups(); i++) { Vector<org.apache.lucene.document.Document> docs = sr.getGroup(i).last(); Element parent = null; for (int j = 0; j < docs.size(); j++) { org.apache.lucene.document.Document doc = sr.getGroup(i).last().get(j); if (0 == j) {// j=0 is head of group. j>0 is tail parent = createAndAddElement(doc, channel, hounderNs); } else { createAndAddElement(doc, parent, hounderNs); } } } return dom; }
From source file:com.flaptor.hounder.searcher.OpenSearch.java
License:Apache License
/**
 * Appends an "item" element describing one search result to <code>parent</code>.
 *
 * <p>The item gets "title", "link" (prefixed with <code>linkPrefix</code>) and "description"
 * children taken from the document's title, link and description fields, each trimmed and
 * passed through <code>DomUtil.filterXml</code>; an empty title is replaced by the link. Every
 * document field whose name is listed in <code>fieldsToShow</code> is added as an extra child
 * in the hounder namespace.
 *
 * @param doc the lucene document holding the result
 * @param parent the element the new item is appended to
 * @param hounderNs the namespace used for the additional fields
 * @return the newly created "item" element
 */
private static Element createAndAddElement(org.apache.lucene.document.Document doc, Element parent,
        Namespace hounderNs) {
    String link = StringUtil.nullToEmpty(doc.get(linkField)).trim();
    String description = StringUtil.nullToEmpty(doc.get(descField)).trim();
    String title = StringUtil.nullToEmpty(doc.get(titleField)).trim();
    if ("".equals(title)) {
        title = link;
    }

    Element item = parent.addElement("item");
    item.addElement("title").addText(DomUtil.filterXml(title));
    item.addElement("link").addText(linkPrefix + DomUtil.filterXml(link));
    item.addElement("description").addText(DomUtil.filterXml(description));

    // the cast mirrors the element type the original iterator-based loop relied on
    for (Object rawField : doc.getFields()) {
        Field f = (Field) rawField;
        if (fieldsToShow.contains(f.name())) {
            item.addElement(QName.get(f.name(), hounderNs)).addText(DomUtil.filterXml(f.stringValue()));
        }
    }
    return item;
}
From source file:com.flaptor.hounder.searcher.XmlResults.java
License:Apache License
/**
 * Creates an XML search results document (verbose version).
 * The generated dom contains only valid xml characters (infringing chars are removed).
 *
 * @param queryString the query string, as entered by the user
 * @param start the offset of the first result
 * @param count the number of results requested (the actual number of results found may be smaller);
 *        "startIndex" and "itemsPerPage" are only written when it is positive
 * @param orderBy the field by which the results are sorted
 * @param sr the GroupedSearchResults structure containing the result of performing the query
 * @param status the code returned by the searcher
 * @param statusMsg the status description
 * @param xsltUri the uri for the xslt used to process the xml on the client side,
 *        or null if no client-side processing is needed
 * @param rangeField field for which a range filter will be applied, or null if no filter used.
 * @param rangeStart start value for the range filter.
 * @param rangeEnd end value for the range filter.
 * @param params a map of parameters sent to the searcher with the request; the first value of
 *        each parameter is added as a child of the root unless a node matching its name
 *        already exists there. Parameters with a null or empty value array are skipped.
 * @return a DOM document
 * <br>An empty sr argument means that no results were found.
 */
public static final Document buildXml(String queryString, int start, int count, String orderBy,
        GroupedSearchResults sr, int status, String statusMsg, String xsltUri, String rangeField,
        String rangeStart, String rangeEnd, Map<String, String[]> params) {

    Document dom = DocumentHelper.createDocument();
    if (null != xsltUri) {
        Map<String, String> map = new HashMap<String, String>();
        map.put("type", "text/xsl");
        map.put("href", xsltUri);
        dom.addProcessingInstruction("xml-stylesheet", map);
    }

    Element root = dom.addElement("SearchResults");
    root.addElement("totalResults").addText(Integer.toString(sr.totalResults()));
    root.addElement("totalGroupsEstimation").addText(Integer.toString(sr.totalGroupsEstimation()));

    if (count > 0) {
        root.addElement("startIndex").addText(Integer.toString(start));
        root.addElement("itemsPerPage").addText(Integer.toString(count));
    }
    if (null != orderBy) {
        root.addElement("orderBy").addText(DomUtil.filterXml(orderBy));
    }
    if (null != queryString) {
        root.addElement("query").addText(DomUtil.filterXml(queryString));
    }
    if (null != rangeField) {
        root.addElement("filter")
                .addAttribute("field", rangeField)
                .addAttribute("start", rangeStart)
                .addAttribute("end", rangeEnd);
    }

    if (null != params) {
        // NOTE(review): the parameter name is used both as an XPath expression and as an element
        // name without validation -- confirm callers only pass trusted, XML-name-safe keys.
        for (Map.Entry<String, String[]> param : params.entrySet()) {
            String key = param.getKey();
            if (null != root.selectSingleNode(key)) {
                // an element with that name was already written above; do not duplicate it
                continue;
            }
            String[] values = param.getValue();
            if (null == values || values.length == 0) {
                // nothing to write for a parameter without values
                continue;
            }
            root.addElement(key).addText(values[0]);
        }
    }

    AQuery suggestedQuery = sr.getSuggestedQuery();
    if (null != suggestedQuery) {
        // NOTE(review): unchecked downcast -- assumes every suggested query is a LazyParsedQuery; confirm.
        root.addElement("suggestedQuery")
                .addText(DomUtil.filterXml(((LazyParsedQuery) suggestedQuery).getQueryString()));
    }

    root.addElement("status").addText(Integer.toString(status));
    root.addElement("statusDesc").addText(statusMsg);

    for (int i = 0; i < sr.groups(); i++) {
        String name = sr.getGroup(i).first();
        Element group = root.addElement("group").addAttribute("name", name);
        // fetch the group's documents once instead of once per document
        Vector<org.apache.lucene.document.Document> docs = sr.getGroup(i).last();
        for (int j = 0; j < docs.size(); j++) {
            createAndAddElement(docs.get(j), group);
        }
    }
    return dom;
}