List of usage examples for org.dom4j Document getRootElement
/** Returns the root {@code Element} of this document. */
Element getRootElement();
From source file:com.fivepebbles.Backup.java
License:MIT License
public void processBackup() { //Retrieve Bucket Names and related files/folders from XML saved locally Document myDocument = null; try {/*w w w .ja va 2s . com*/ URL myURL = new File("target", "s3files.xml").toURI().toURL(); SAXReader myReader = new SAXReader(); myDocument = myReader.read(myURL); } catch (MalformedURLException | DocumentException e) { //***TODO*** log Msg e.printStackTrace(); } Element root = myDocument.getRootElement(); for (Iterator<Element> i1 = root.elementIterator(); i1.hasNext();) { Element bucketelem = i1.next(); if (bucketelem.getName() == "bucket") { bucketnm = bucketelem.attributeValue("name"); for (Iterator<Element> i2 = bucketelem.elementIterator(); i2.hasNext();) { Element fileelem = i2.next(); if (fileelem.getName() == "file") { bfile = fileelem.getText(); //Get list of files (bfile could be a folder name) ProcessFiles p1 = new ProcessFiles(); filelist = p1.getFiles(bfile); //Append files to arraylist if (filelist != null) { totalfilelist.addAll(filelist); } } } //Make the data good for S3 //Replace "\" with "/" for Windows for (int j = 0; j < totalfilelist.size(); j++) { if (totalfilelist.get(j).contains("\\")) { newfilelist.add(totalfilelist.get(j).replace("\\", "/")); } else { newfilelist.add(totalfilelist.get(j)); } } //Remove Driveletter from files/object list if present (Windows) for (int k = 0; k < newfilelist.size(); k++) { if (newfilelist.get(k).contains("C:")) { newfilelist2.add(newfilelist.get(k).replace("C:", "")); } else { newfilelist2.add(newfilelist.get(k)); } } //Get S3 key list corresponding to the files //This is obtained by removing the "/" in index 0 from the file list since AWS key should not have a / at position 0 for (int m = 0; m < newfilelist2.size(); m++) { keylist.add(newfilelist2.get(m).substring(1)); } //Backup files in S3 (for this bucket) //Get AWS Credentials String[] awskeys = new S3Credentials().getCredentials(); //Set AWS credentials ProcessAWS pr1 = new ProcessAWS(awskeys[0], awskeys[1]); //Check if 
Bucket exists in S3 if (pr1.checkBucket(bucketnm)) { //Put Objects in S3 //keylist contains S3 keys and newfilelist2 contains the files for (int l = 0; l < newfilelist2.size(); l++) { boolean r1 = pr1.putAWSObject(keylist.get(l), new File(newfilelist2.get(l)), bucketnm); if (!r1) { //***TODO*** Log message } } } else { //Create Bucket in S3 boolean r2 = pr1.createBucket(bucketnm); if (r2) { //Put Objects in S3 //keylist contains S3 keys and newfilelist2 contains the files for (int m = 0; m < newfilelist2.size(); m++) { boolean r3 = pr1.putAWSObject(keylist.get(m), new File(newfilelist2.get(m)), bucketnm); if (!r3) { //***TODO*** Log message } } } else { //***TODO*** Log message } } } //Clear arrays for the next bucket totalfilelist.clear(); newfilelist.clear(); newfilelist2.clear(); keylist.clear(); } }
From source file:com.flaptor.hounder.indexer.CommandsModule.java
License:Apache License
/**
 * Executes the command carried by a {@code command} document.
 *
 * <p>When the root element is named {@code command}, its {@code name} attribute selects
 * {@code optimize()}, {@code close()} or {@code checkpoint()}. A missing or unrecognised
 * name is logged as an error and ignored. Documents with any other root (or no root at
 * all) are returned untouched.
 *
 * @param doc a non null document to process
 * @return an empty array when {@code doc} was a command document, otherwise a
 *         one-element array holding {@code doc}
 */
public Document[] internalProcess(final Document doc) {
    final Element rootElement = doc.getRootElement();
    final boolean isCommand = rootElement != null && rootElement.getName().equals("command");
    if (!isCommand) {
        return new Document[] { doc };
    }

    final String commandName = rootElement.attributeValue("name");
    if (commandName == null) {
        logger.error("Invalid command: no name set. Ignoring it.");
    } else if (commandName.equals("optimize")) {
        optimize();
    } else if (commandName.equals("close")) {
        close();
    } else if (commandName.equals("checkpoint")) {
        checkpoint();
    } else {
        logger.error("Unknown command received. Ignoring it.");
    }

    // Command documents are consumed here: nothing is handed back.
    return new Document[0];
}
From source file:com.flaptor.hounder.indexer.DocumentConverter.java
License:Apache License
/** *Converts a hounder's add document into a lucene document. *This method is thread safe./*w w w. j a v a2 s . c o m*/ *@throws IllegalArgumentException if the document is malformed, if it's not an add *if it does not contain the required fields, etc. */ public org.apache.lucene.document.Document convert(final Document doc) throws IllegalDocumentException { Element root = doc.getRootElement(); if (root.getName().equals("documentAdd")) { return processAdd(root); } else { throw new IllegalDocumentException("This is not an add document."); } }
From source file:com.flaptor.hounder.indexer.FieldFormatCheckerModule.java
License:Apache License
/** * Processes the document. Takes the xml document, prints it to the logger, * and returns the same document./*from www . j a va 2 s .c o m*/ */ protected Document[] internalProcess(final Document doc) { // check that this is a documentAdd // otherwise, skip. Node root = doc.getRootElement(); if (!root.getName().equals("documentAdd")) return new Document[] { doc }; for (String longField : longFields) { Node node = doc.selectSingleNode("//field[@name='" + longField + "']"); if (null == node) { logger.error("Document lacks field " + longField + ". Dropping document. "); if (logger.isDebugEnabled()) { logger.debug(DomUtil.domToString(doc) + " lacks field " + longField); } return new Document[0]; } String text = node.getText(); try { Long.parseLong(text); } catch (NumberFormatException e) { logger.error( "Document has field " + longField + ", but it is not parseable as Long. Dropping document"); if (logger.isDebugEnabled()) { logger.debug(DomUtil.domToString(doc) + " contains field " + longField + " but it is not parseable as Long. Node:" + node.toString() + " - text: " + text); } return new Document[0]; } } // TODO insert more field type checks here Document[] docs = { doc }; return docs; }
From source file:com.flaptor.hounder.indexer.HtmlParser.java
License:Apache License
/** * Parses the html document and extracts the indexable text. * /*from w w w .java2 s.c o m*/ * @param doc * the dom4j doc to process. * @return a single document, in wich some fields may have been added, using * the info from the "body" field. The fields are added at the same * level as the "body" and the original "body" is preserved. */ public final Document[] internalProcess(final Document doc) { Document[] docs = { doc }; // check that this is a documentAdd // otherwise, skip. Element root = doc.getRootElement(); if (!root.getName().equals("documentAdd")) return docs; try { for (Pair<String, String> tag : tags) { processTag(doc, tag.first(), tag.last()); } } catch (Exception e) { logger.warn("internalProcess: while running processTag:" + e.getMessage(), e); return null; } return docs; }
From source file:com.flaptor.hounder.indexer.SanitizerModule.java
License:Apache License
/**
 * Sanitizes the text of the configured fields in place.
 *
 * <p>For every name in {@code allFields} the element is located by substituting the name
 * for {@code $} in {@code xpath}. Depending on which of {@code htmlFields},
 * {@code xmlFields} and {@code accentFields} contain the name, the element text is run
 * through the html parser, {@code DomUtil.filterXml} and {@code filterAccents}, in that
 * order, and written back. A failure while sanitizing one field is logged as a warning
 * and the remaining fields are still processed.
 *
 * @param doc a non null document to process
 * @return a one-element array holding {@code doc}
 */
public Document[] internalProcess(final Document doc) {
    final Document[] result = { doc };

    final Element rootElement = doc.getRootElement();
    if (rootElement == null) {
        return result;
    }

    for (String name : allFields) {
        final Element field = (Element) rootElement.selectSingleNode(xpath.replace("$", name));
        if (field == null) {
            continue;
        }
        try {
            String sanitized = field.getText();
            if (htmlFields.contains(name)) {
                sanitized = htmlParser.parse("internal document", sanitized.getBytes("UTF-8"), "UTF-8").getText();
            }
            if (xmlFields.contains(name)) {
                sanitized = DomUtil.filterXml(sanitized);
            }
            if (accentFields.contains(name)) {
                sanitized = filterAccents(sanitized);
            }
            field.setText(sanitized);
        } catch (Exception e) {
            logger.warn("Sanitizing field " + name, e);
        }
    }

    return result;
}
From source file:com.flaptor.hounder.indexer.Writer.java
License:Apache License
/** * Implementation of the Module interface. Depending on the document * received, the Writer can insert or delete a document, or process it as a * command./*from ww w.j av a 2s.c om*/ * @see Writer */ protected Document[] internalProcess(final Document doc) { Element root = doc.getRootElement(); if (root.getName().equals("documentAdd")) { try { iwp.addDocument(DocumentConverter.getInstance().convert(doc)); } catch (DocumentConverter.IllegalDocumentException e) { logger.error( "Exception while converting this document to lucene. Check the document format. This document " + "won't be added to the index.", e); } } else if (root.getName().equals("documentDelete")) { processDelete(root); } else { logger.error("Invalid format received"); } return null; }
From source file:com.flaptor.hounder.searcher.XmlSearchHandler.java
License:Apache License
/** * Request can have the following parameters: * * query/* w ww . j a v a 2 s .co m*/ * start * hitsPerPage * categories * site * group = < "site" | "signature"> * orderBy * crawl * xsltUri * raw true|false * this method is a merge of search-base.jsp, opensearch.jsp and http://docs.codehaus.org/display/JETTY/Embedding+Jetty */ public void handle(String target, HttpServletRequest request, HttpServletResponse response, int dispatch) throws IOException, ServletException { response.setCharacterEncoding("utf-8"); PrintWriter pw = response.getWriter(); Document originalDom = doQuery(request, searcher); originalDom.getRootElement().addAttribute("SearchEngine", "Hounder (hounder.org)") .addAttribute("DevelopedBy", "Flaptor (flaptor.com)"); @SuppressWarnings("unchecked") String rawStr = getParameter(request.getParameterMap(), "raw"); if (Boolean.parseBoolean(rawStr) || transformMap.isEmpty()) { response.setContentType("text/xml"); String openSearchResults = DomUtil.domToString(originalDom); pw.print(openSearchResults); pw.flush(); } else { Pair<Transformer, String> value = transformMap.get(request.getPathInfo()); //System.out.println("XML HANDLE: path="+request.getPathInfo()+" value="+value); if (null == value) { response.sendError(HttpServletResponse.SC_NOT_FOUND, "There's no xslt to serve this context."); return; } Transformer transformer = value.first(); String contentType = value.last(); response.setContentType(contentType); try { synchronized (transformer) { transformer.transform(new DocumentSource(originalDom), new StreamResult(pw)); } } catch (TransformerException e) { logger.error( "internalProcess: exception while transforming document. (set error level to debug to see the offending document)", e); logger.debug("offending document was: " + DomUtil.domToString(originalDom)); response.sendError(HttpServletResponse.SC_INTERNAL_SERVER_ERROR, "Error processing internal xslt."); return; } } }
From source file:com.flaptor.hounder.util.HtmlParser.java
License:Apache License
/** * Parse the given html document./* w ww .jav a 2s. c o m*/ * @param content the html document to parse. * @return the parsed string. */ public Output parse(String url, String content) throws Exception { // <html xmlns=...> ==> <html> content = REGEXP_HTML.matcher(content).replaceFirst("<html>"); // Parser keeps state, synchronize in case its used in a multi-threaded setting. Output out = new Output(url); synchronized (this) { try { // use cyberneko to parse the html documents (even broken ones) org.xml.sax.InputSource inputSource = new org.xml.sax.InputSource( new java.io.ByteArrayInputStream(content.getBytes())); parser.parse(inputSource); } catch (Exception e) { logger.warn("Exception while trying to parse [" + content + "]"); throw e; } DOMReader reader = new DOMReader(); Document htmlDoc; try { // get the doc that resulted from parsing the text org.w3c.dom.Document document = parser.getDocument(); htmlDoc = reader.read(document); } catch (java.lang.StackOverflowError e) { logger.warn("Out of stack memory trying to parse [" + content + "]"); throw new Exception(); } // this 2 must be before the ignoreXPath, else an ignoreXPath that // includes the //TITLE will imply that the title is not indexed // extract the links extractLinks(htmlDoc, out); // extact the title extractTitle(htmlDoc, out); ignoreXpath(htmlDoc); replaceSeparatorTags(htmlDoc); // extract the text from the html tags extractText(htmlDoc.getRootElement(), out, HTMLPARSER_CONTENT); // extract special fields extractFields(htmlDoc, out); } out.close(); return out; }
From source file:com.flaptor.util.DomUtil.java
License:Apache License
/**
 * Command-line helper: reads the html file named by the first argument, parses it with
 * {@code HtmlParser} and prints the result of {@code getElementTextRecursively} on the
 * root element to standard output.
 *
 * @param arg command-line arguments; {@code arg[0]} is the path of the html file
 * @throws IllegalArgumentException if no file path is given
 * @throws Exception if reading or parsing the file fails
 */
public static void main(String[] arg) throws Exception {
    // Fail with an explicit usage message rather than an ArrayIndexOutOfBoundsException.
    if (arg == null || arg.length < 1) {
        throw new IllegalArgumentException("Usage: DomUtil <html-file>");
    }

    String str = FileUtil.readFile(new File(arg[0]));
    HtmlParser parser = new HtmlParser();
    // NOTE(review): getBytes() uses the platform default charset - confirm against
    // what getHtmlDocument expects.
    Document htmlDocument = parser.getHtmlDocument("http://url.com", str.getBytes());
    System.out.println(getElementTextRecursively(htmlDocument.getRootElement()));
}