Example usage for org.dom4j Document getRootElement

List of usage examples for org.dom4j Document getRootElement

Introduction

In this page you can find the example usage for org.dom4j Document getRootElement.

Prototype

Element getRootElement();

Source Link

Document

Returns the root Element for this document.

Usage

From source file:com.fivepebbles.Backup.java

License:MIT License

/**
 * Backs up local files to S3 buckets as described by the locally saved
 * XML file target/s3files.xml.
 *
 * Expected XML shape (inferred from the traversal below — confirm against
 * whatever writes s3files.xml):
 *   &lt;root&gt;&lt;bucket name="..."&gt;&lt;file&gt;path&lt;/file&gt;...&lt;/bucket&gt;...&lt;/root&gt;
 */
public void processBackup() {

    //Retrieve Bucket Names and related files/folders from XML saved locally
    Document myDocument = null;

    try {
        URL myURL = new File("target", "s3files.xml").toURI().toURL();
        SAXReader myReader = new SAXReader();
        myDocument = myReader.read(myURL);
    } catch (MalformedURLException | DocumentException e) {
        //***TODO*** log Msg
        e.printStackTrace();
    }

    // BUG FIX: the original dereferenced myDocument unconditionally and threw a
    // NullPointerException whenever the XML could not be read. Bail out instead.
    if (myDocument == null) {
        return;
    }

    Element root = myDocument.getRootElement();

    for (Iterator<Element> i1 = root.elementIterator(); i1.hasNext();) {
        Element bucketelem = i1.next();

        // BUG FIX: string content must be compared with equals(), not the
        // reference-identity operator ==. The original only matched when both
        // strings happened to be the same interned object.
        if ("bucket".equals(bucketelem.getName())) {
            bucketnm = bucketelem.attributeValue("name");

            for (Iterator<Element> i2 = bucketelem.elementIterator(); i2.hasNext();) {
                Element fileelem = i2.next();

                // BUG FIX: equals() instead of == here as well.
                if ("file".equals(fileelem.getName())) {
                    bfile = fileelem.getText();

                    //Get list of files (bfile could be a folder name)
                    ProcessFiles p1 = new ProcessFiles();
                    filelist = p1.getFiles(bfile);

                    //Append files to arraylist
                    if (filelist != null) {
                        totalfilelist.addAll(filelist);
                    }
                }
            }

            //Make the data good for S3: replace "\" with "/" for Windows.
            //String.replace() is a no-op when the target is absent, so the
            //original contains() pre-checks were redundant.
            for (int j = 0; j < totalfilelist.size(); j++) {
                newfilelist.add(totalfilelist.get(j).replace("\\", "/"));
            }

            //Remove Driveletter from files/object list if present (Windows)
            //***TODO*** handle drive letters other than C:
            for (int k = 0; k < newfilelist.size(); k++) {
                newfilelist2.add(newfilelist.get(k).replace("C:", ""));
            }

            //Get S3 key list corresponding to the files.
            //AWS keys must not start with "/", so drop the char at index 0.
            for (int m = 0; m < newfilelist2.size(); m++) {
                keylist.add(newfilelist2.get(m).substring(1));
            }

            //Get and set AWS Credentials
            String[] awskeys = new S3Credentials().getCredentials();
            ProcessAWS pr1 = new ProcessAWS(awskeys[0], awskeys[1]);

            //Use the existing bucket or create it if missing, then upload.
            //This collapses the two identical upload loops of the original.
            boolean bucketReady = pr1.checkBucket(bucketnm) || pr1.createBucket(bucketnm);
            if (bucketReady) {
                //Put Objects in S3: keylist holds S3 keys, newfilelist2 the files
                for (int l = 0; l < newfilelist2.size(); l++) {
                    boolean stored = pr1.putAWSObject(keylist.get(l), new File(newfilelist2.get(l)), bucketnm);

                    if (!stored) {
                        //***TODO*** Log message
                    }
                }
            } else {
                //Bucket neither existed nor could be created
                //***TODO*** Log message
            }
        }

        //Clear arrays for the next bucket
        totalfilelist.clear();
        newfilelist.clear();
        newfilelist2.clear();
        keylist.clear();
    }
}

From source file:com.flaptor.hounder.indexer.CommandsModule.java

License:Apache License

/**
 * Executes the command carried by the document, if the document is a
 * command; otherwise passes the document through untouched.
 *
 * @param doc a non null document to process
 */
public Document[] internalProcess(final Document doc) {
    final Element root = doc.getRootElement();

    // Anything that is not a <command> element passes through untouched.
    if (null == root || !root.getName().equals("command")) {
        return new Document[] { doc };
    }

    final String name = root.attributeValue("name");
    if (null == name) {
        logger.error("Invalid command: no name set. Ignoring it.");
        return new Document[0];
    }

    switch (name) {
    case "optimize":
        optimize();
        break;
    case "close":
        close();
        break;
    case "checkpoint":
        checkpoint();
        break;
    default:
        logger.error("Unknown command received. Ignoring it.");
        break;
    }
    // Commands are consumed: nothing is forwarded down the pipeline.
    return new Document[0];
}

From source file:com.flaptor.hounder.indexer.DocumentConverter.java

License:Apache License

/**
 *Converts a hounder's add document into a lucene document.
 *This method is thread safe./*w w  w.  j  a  v  a2 s .  c o m*/
 *@throws IllegalArgumentException if the document is malformed, if it's not an add
 *if it does not contain the required fields, etc.
*/
public org.apache.lucene.document.Document convert(final Document doc) throws IllegalDocumentException {
    Element root = doc.getRootElement();
    if (root.getName().equals("documentAdd")) {
        return processAdd(root);
    } else {
        throw new IllegalDocumentException("This is not an add document.");
    }
}

From source file:com.flaptor.hounder.indexer.FieldFormatCheckerModule.java

License:Apache License

/**
 * Processes the document. Takes the xml document, prints it to the logger,
 * and returns the same document./*from   www  .  j a va  2 s  .c o m*/
 */
protected Document[] internalProcess(final Document doc) {

    // check that this is a documentAdd
    // otherwise, skip.
    Node root = doc.getRootElement();
    if (!root.getName().equals("documentAdd"))
        return new Document[] { doc };

    for (String longField : longFields) {
        Node node = doc.selectSingleNode("//field[@name='" + longField + "']");
        if (null == node) {
            logger.error("Document lacks field " + longField + ". Dropping document. ");
            if (logger.isDebugEnabled()) {
                logger.debug(DomUtil.domToString(doc) + " lacks field " + longField);
            }
            return new Document[0];
        }

        String text = node.getText();
        try {
            Long.parseLong(text);
        } catch (NumberFormatException e) {
            logger.error(
                    "Document has field " + longField + ", but it is not parseable as Long. Dropping document");
            if (logger.isDebugEnabled()) {
                logger.debug(DomUtil.domToString(doc) + " contains field " + longField
                        + " but it is not parseable as Long. Node:" + node.toString() + " - text: " + text);
            }
            return new Document[0];
        }
    }

    // TODO insert more field type checks here
    Document[] docs = { doc };
    return docs;
}

From source file:com.flaptor.hounder.indexer.HtmlParser.java

License:Apache License

/**
 * Parses the html document and extracts the indexable text.
 * /*from   w w  w .java2 s.c  o  m*/
 * @param doc
 *            the dom4j doc to process.
 * @return a single document, in wich some fields may have been added, using
 *         the info from the "body" field. The fields are added at the same
 *         level as the "body" and the original "body" is preserved.
 */
public final Document[] internalProcess(final Document doc) {
    Document[] docs = { doc };

    // check that this is a documentAdd
    // otherwise, skip.
    Element root = doc.getRootElement();
    if (!root.getName().equals("documentAdd"))
        return docs;

    try {
        for (Pair<String, String> tag : tags) {
            processTag(doc, tag.first(), tag.last());
        }
    } catch (Exception e) {
        logger.warn("internalProcess: while running processTag:" + e.getMessage(), e);
        return null;
    }
    return docs;
}

From source file:com.flaptor.hounder.indexer.SanitizerModule.java

License:Apache License

/**
 * Sanitizes the configured fields of the document in place. For each field
 * name in allFields the matching element (located via the xpath template) is
 * processed: html fields are reduced to their text content, xml fields are
 * filtered of invalid characters, and accent fields have accents removed.
 *
 * @param doc a non null document to process
 * @return the same document, wrapped in a one-element array
 */
public Document[] internalProcess(final Document doc) {
    Element root = doc.getRootElement();
    if (null != root) {
        for (String name : allFields) {
            // xpath template uses "$" as the placeholder for the field name
            Element elem = (Element) root.selectSingleNode(xpath.replace("$", name));
            if (null == elem) {
                continue;
            }
            try {
                String text = elem.getText();

                if (htmlFields.contains(name)) {
                    // IMPROVED: pass the Charset constant instead of the charset
                    // name string; this removes the checked
                    // UnsupportedEncodingException path entirely.
                    text = htmlParser.parse("internal document",
                            text.getBytes(java.nio.charset.StandardCharsets.UTF_8), "UTF-8").getText();
                }

                if (xmlFields.contains(name)) {
                    text = DomUtil.filterXml(text);
                }

                if (accentFields.contains(name)) {
                    text = filterAccents(text);
                }

                elem.setText(text);
            } catch (Exception e) {
                // Best-effort: a failure in one field must not abort the rest.
                logger.warn("Sanitizing field " + name, e);
            }
        }
    }
    Document[] docs = { doc };
    return docs;
}

From source file:com.flaptor.hounder.indexer.Writer.java

License:Apache License

/**
 * Implementation of the Module interface. Depending on the document
 * received, the Writer can insert or delete a document, or process it as a
 * command./*from ww w.j av a 2s.c  om*/
 * @see Writer
 */
protected Document[] internalProcess(final Document doc) {
    Element root = doc.getRootElement();
    if (root.getName().equals("documentAdd")) {
        try {
            iwp.addDocument(DocumentConverter.getInstance().convert(doc));
        } catch (DocumentConverter.IllegalDocumentException e) {
            logger.error(
                    "Exception while converting this document to lucene. Check the document format. This document "
                            + "won't be added to the index.",
                    e);
        }
    } else if (root.getName().equals("documentDelete")) {
        processDelete(root);
    } else {
        logger.error("Invalid format received");
    }
    return null;
}

From source file:com.flaptor.hounder.searcher.XmlSearchHandler.java

License:Apache License

/**
 * Serves an XML (or XSLT-transformed) search result page.
 *
 * The request can carry the following parameters (consumed by doQuery):
 * query, start, hitsPerPage, categories, site,
 * group = &lt;"site" | "signature"&gt;, orderBy, crawl, xsltUri,
 * raw true|false.
 *
 * This method is a merge of search-base.jsp, opensearch.jsp and
 * http://docs.codehaus.org/display/JETTY/Embedding+Jetty
 */
public void handle(String target, HttpServletRequest request, HttpServletResponse response, int dispatch)
        throws IOException, ServletException {
    response.setCharacterEncoding("utf-8");
    PrintWriter pw = response.getWriter();

    // Run the search, then tag the result document with branding attributes.
    Document originalDom = doQuery(request, searcher);
    originalDom.getRootElement().addAttribute("SearchEngine", "Hounder (hounder.org)")
            .addAttribute("DevelopedBy", "Flaptor (flaptor.com)");

    @SuppressWarnings("unchecked")
    String rawStr = getParameter(request.getParameterMap(), "raw");
    // raw=true (or no transforms configured): emit the DOM as plain xml.
    if (Boolean.parseBoolean(rawStr) || transformMap.isEmpty()) {
        response.setContentType("text/xml");
        String openSearchResults = DomUtil.domToString(originalDom);
        pw.print(openSearchResults);
        pw.flush();
    } else {
        // Pick the xslt transformer registered for this request path.
        Pair<Transformer, String> value = transformMap.get(request.getPathInfo());
        //System.out.println("XML HANDLE: path="+request.getPathInfo()+"  value="+value);
        if (null == value) {
            response.sendError(HttpServletResponse.SC_NOT_FOUND, "There's no xslt to serve this context.");
            return;
        }
        Transformer transformer = value.first();
        String contentType = value.last();

        response.setContentType(contentType);
        try {
            // Transformer instances are not thread-safe, hence the lock.
            synchronized (transformer) {
                transformer.transform(new DocumentSource(originalDom), new StreamResult(pw));
            }
        } catch (TransformerException e) {
            logger.error(
                    "internalProcess: exception while transforming document. (set error level to debug to see the offending document)",
                    e);
            logger.debug("offending document was: " + DomUtil.domToString(originalDom));
            response.sendError(HttpServletResponse.SC_INTERNAL_SERVER_ERROR, "Error processing internal xslt.");
            return;
        }
    }
}

From source file:com.flaptor.hounder.util.HtmlParser.java

License:Apache License

/**
 * Parse the given html document./* w ww  .jav a  2s. c o  m*/
 * @param content the html document to parse.
 * @return the parsed string.
 */
public Output parse(String url, String content) throws Exception {
    // <html xmlns=...>  ==>   <html>
    content = REGEXP_HTML.matcher(content).replaceFirst("<html>");
    // Parser keeps state, synchronize in case its used in a multi-threaded setting.
    Output out = new Output(url);
    synchronized (this) {
        try {
            // use cyberneko to parse the html documents (even broken ones)
            org.xml.sax.InputSource inputSource = new org.xml.sax.InputSource(
                    new java.io.ByteArrayInputStream(content.getBytes()));
            parser.parse(inputSource);
        } catch (Exception e) {
            logger.warn("Exception while trying to parse [" + content + "]");
            throw e;
        }
        DOMReader reader = new DOMReader();
        Document htmlDoc;
        try {
            // get the doc that resulted from parsing the text                
            org.w3c.dom.Document document = parser.getDocument();
            htmlDoc = reader.read(document);
        } catch (java.lang.StackOverflowError e) {
            logger.warn("Out of stack memory trying to parse [" + content + "]");
            throw new Exception();
        }
        // this 2 must be before the ignoreXPath, else an ignoreXPath that
        // includes the //TITLE will imply that the title is not indexed
        // extract the links
        extractLinks(htmlDoc, out);

        // extact the title
        extractTitle(htmlDoc, out);

        ignoreXpath(htmlDoc);

        replaceSeparatorTags(htmlDoc);

        // extract the text from the html tags
        extractText(htmlDoc.getRootElement(), out, HTMLPARSER_CONTENT);

        // extract special fields
        extractFields(htmlDoc, out);
    }
    out.close();
    return out;
}

From source file:com.flaptor.util.DomUtil.java

License:Apache License

/**
 * Command line entry point: parses the html file named by the first argument
 * and prints the recursively extracted element text to stdout.
 */
public static void main(String[] arg) throws Exception {
    final String html = FileUtil.readFile(new File(arg[0]));
    final Document htmlDocument = new HtmlParser().getHtmlDocument("http://url.com", html.getBytes());
    System.out.println(getElementTextRecursively(htmlDocument.getRootElement()));
}