Example usage for org.dom4j Document getRootElement

List of usage examples for org.dom4j Document getRootElement

Introduction

In this page you can find the example usage for org.dom4j Document getRootElement.

Prototype

Element getRootElement();

Source Link

Document

Returns the root Element for this document.

Usage

From source file:com.fivepebbles.Backup.java

License:MIT License

/**
 * Backs up local files to S3 buckets as described by the locally saved
 * XML file target/s3files.xml.
 *
 * Expected XML shape (inferred from the traversal below — confirm against
 * whatever writes s3files.xml):
 *   &lt;root&gt;&lt;bucket name="..."&gt;&lt;file&gt;path&lt;/file&gt;...&lt;/bucket&gt;...&lt;/root&gt;
 */
public void processBackup() {

    //Retrieve Bucket Names and related files/folders from XML saved locally
    Document myDocument = null;

    try {
        URL myURL = new File("target", "s3files.xml").toURI().toURL();
        SAXReader myReader = new SAXReader();
        myDocument = myReader.read(myURL);
    } catch (MalformedURLException | DocumentException e) {
        //***TODO*** log Msg
        e.printStackTrace();
    }

    // BUG FIX: the original dereferenced myDocument unconditionally and threw a
    // NullPointerException whenever the XML could not be read. Bail out instead.
    if (myDocument == null) {
        return;
    }

    Element root = myDocument.getRootElement();

    for (Iterator<Element> i1 = root.elementIterator(); i1.hasNext();) {
        Element bucketelem = i1.next();

        // BUG FIX: string content must be compared with equals(), not the
        // reference-identity operator ==. The original only matched when both
        // strings happened to be the same interned object.
        if ("bucket".equals(bucketelem.getName())) {
            bucketnm = bucketelem.attributeValue("name");

            for (Iterator<Element> i2 = bucketelem.elementIterator(); i2.hasNext();) {
                Element fileelem = i2.next();

                // BUG FIX: equals() instead of == here as well.
                if ("file".equals(fileelem.getName())) {
                    bfile = fileelem.getText();

                    //Get list of files (bfile could be a folder name)
                    ProcessFiles p1 = new ProcessFiles();
                    filelist = p1.getFiles(bfile);

                    //Append files to arraylist
                    if (filelist != null) {
                        totalfilelist.addAll(filelist);
                    }
                }
            }

            //Make the data good for S3: replace "\" with "/" for Windows.
            //String.replace() is a no-op when the target is absent, so the
            //original contains() pre-checks were redundant.
            for (int j = 0; j < totalfilelist.size(); j++) {
                newfilelist.add(totalfilelist.get(j).replace("\\", "/"));
            }

            //Remove Driveletter from files/object list if present (Windows)
            //***TODO*** handle drive letters other than C:
            for (int k = 0; k < newfilelist.size(); k++) {
                newfilelist2.add(newfilelist.get(k).replace("C:", ""));
            }

            //Get S3 key list corresponding to the files.
            //AWS keys must not start with "/", so drop the char at index 0.
            for (int m = 0; m < newfilelist2.size(); m++) {
                keylist.add(newfilelist2.get(m).substring(1));
            }

            //Get and set AWS Credentials
            String[] awskeys = new S3Credentials().getCredentials();
            ProcessAWS pr1 = new ProcessAWS(awskeys[0], awskeys[1]);

            //Use the existing bucket or create it if missing, then upload.
            //This collapses the two identical upload loops of the original.
            boolean bucketReady = pr1.checkBucket(bucketnm) || pr1.createBucket(bucketnm);
            if (bucketReady) {
                //Put Objects in S3: keylist holds S3 keys, newfilelist2 the files
                for (int l = 0; l < newfilelist2.size(); l++) {
                    boolean stored = pr1.putAWSObject(keylist.get(l), new File(newfilelist2.get(l)), bucketnm);

                    if (!stored) {
                        //***TODO*** Log message
                    }
                }
            } else {
                //Bucket neither existed nor could be created
                //***TODO*** Log message
            }
        }

        //Clear arrays for the next bucket
        totalfilelist.clear();
        newfilelist.clear();
        newfilelist2.clear();
        keylist.clear();
    }
}

From source file:com.flaptor.hounder.indexer.CommandsModule.java

License:Apache License

/**
 * Executes the command carried by the document, if the document is a
 * command; otherwise passes the document through untouched.
 *
 * @param doc a non null document to process
 */
public Document[] internalProcess(final Document doc) {
    final Element root = doc.getRootElement();

    // Anything that is not a <command> element passes through untouched.
    if (null == root || !root.getName().equals("command")) {
        return new Document[] { doc };
    }

    final String name = root.attributeValue("name");
    if (null == name) {
        logger.error("Invalid command: no name set. Ignoring it.");
        return new Document[0];
    }

    switch (name) {
    case "optimize":
        optimize();
        break;
    case "close":
        close();
        break;
    case "checkpoint":
        checkpoint();
        break;
    default:
        logger.error("Unknown command received. Ignoring it.");
        break;
    }
    // Commands are consumed: nothing is forwarded down the pipeline.
    return new Document[0];
}

From source file:com.flaptor.hounder.indexer.DocumentConverter.java

License:Apache License

/**
 *Converts a hounder's add document into a lucene document.
 *This method is thread safe./*w w  w.  j  a  v  a2 s .  c o m*/
 *@throws IllegalArgumentException if the document is malformed, if it's not an add
 *if it does not contain the required fields, etc.
*/
public org.apache.lucene.document.Document convert(final Document doc) throws IllegalDocumentException {
    Element root = doc.getRootElement();
    if (root.getName().equals("documentAdd")) {
        return processAdd(root);
    } else {
        throw new IllegalDocumentException("This is not an add document.");
    }
}

From source file:com.flaptor.hounder.indexer.FieldFormatCheckerModule.java

License:Apache License

/**
 * Processes the document. Takes the xml document, prints it to the logger,
 * and returns the same document./*from   www  .  j a va  2 s  .c o m*/
 */
protected Document[] internalProcess(final Document doc) {

    // check that this is a documentAdd
    // otherwise, skip.
    Node root = doc.getRootElement();
    if (!root.getName().equals("documentAdd"))
        return new Document[] { doc };

    for (String longField : longFields) {
        Node node = doc.selectSingleNode("//field[@name='" + longField + "']");
        if (null == node) {
            logger.error("Document lacks field " + longField + ". Dropping document. ");
            if (logger.isDebugEnabled()) {
                logger.debug(DomUtil.domToString(doc) + " lacks field " + longField);
            }
            return new Document[0];
        }

        String text = node.getText();
        try {
            Long.parseLong(text);
        } catch (NumberFormatException e) {
            logger.error(
                    "Document has field " + longField + ", but it is not parseable as Long. Dropping document");
            if (logger.isDebugEnabled()) {
                logger.debug(DomUtil.domToString(doc) + " contains field " + longField
                        + " but it is not parseable as Long. Node:" + node.toString() + " - text: " + text);
            }
            return new Document[0];
        }
    }

    // TODO insert more field type checks here
    Document[] docs = { doc };
    return docs;
}

From source file:com.flaptor.hounder.indexer.HtmlParser.java

License:Apache License

/**
 * Parses the html document and extracts the indexable text.
 * /*from   w w  w .java2 s.c  o  m*/
 * @param doc
 *            the dom4j doc to process.
 * @return a single document, in wich some fields may have been added, using
 *         the info from the "body" field. The fields are added at the same
 *         level as the "body" and the original "body" is preserved.
 */
public final Document[] internalProcess(final Document doc) {
    Document[] docs = { doc };

    // check that this is a documentAdd
    // otherwise, skip.
    Element root = doc.getRootElement();
    if (!root.getName().equals("documentAdd"))
        return docs;

    try {
        for (Pair<String, String> tag : tags) {
            processTag(doc, tag.first(), tag.last());
        }
    } catch (Exception e) {
        logger.warn("internalProcess: while running processTag:" + e.getMessage(), e);
        return null;
    }
    return docs;
}

From source file:com.flaptor.hounder.indexer.SanitizerModule.java

License:Apache License

/**
 * Sanitizes the configured fields of the document in place. For each field
 * name in allFields the matching element (located via the xpath template) is
 * processed: html fields are reduced to their text content, xml fields are
 * filtered of invalid characters, and accent fields have accents removed.
 *
 * @param doc a non null document to process
 * @return the same document, wrapped in a one-element array
 */
public Document[] internalProcess(final Document doc) {
    Element root = doc.getRootElement();
    if (null != root) {
        for (String name : allFields) {
            // xpath template uses "$" as the placeholder for the field name
            Element elem = (Element) root.selectSingleNode(xpath.replace("$", name));
            if (null == elem) {
                continue;
            }
            try {
                String text = elem.getText();

                if (htmlFields.contains(name)) {
                    // IMPROVED: pass the Charset constant instead of the charset
                    // name string; this removes the checked
                    // UnsupportedEncodingException path entirely.
                    text = htmlParser.parse("internal document",
                            text.getBytes(java.nio.charset.StandardCharsets.UTF_8), "UTF-8").getText();
                }

                if (xmlFields.contains(name)) {
                    text = DomUtil.filterXml(text);
                }

                if (accentFields.contains(name)) {
                    text = filterAccents(text);
                }

                elem.setText(text);
            } catch (Exception e) {
                // Best-effort: a failure in one field must not abort the rest.
                logger.warn("Sanitizing field " + name, e);
            }
        }
    }
    Document[] docs = { doc };
    return docs;
}

From source file:com.flaptor.hounder.indexer.Writer.java

License:Apache License

/**
 * Implementation of the Module interface. Depending on the document
 * received, the Writer can insert or delete a document, or process it as a
 * command./*from ww w.j av a 2s.c  om*/
 * @see Writer
 */
protected Document[] internalProcess(final Document doc) {
    Element root = doc.getRootElement();
    if (root.getName().equals("documentAdd")) {
        try {
            iwp.addDocument(DocumentConverter.getInstance().convert(doc));
        } catch (DocumentConverter.IllegalDocumentException e) {
            logger.error(
                    "Exception while converting this document to lucene. Check the document format. This document "
                            + "won't be added to the index.",
                    e);
        }
    } else if (root.getName().equals("documentDelete")) {
        processDelete(root);
    } else {
        logger.error("Invalid format received");
    }
    return null;
}

From source file:com.flaptor.hounder.searcher.XmlSearchHandler.java

License:Apache License

/**
 * Serves an XML (or XSLT-transformed) search result page.
 *
 * The request can carry the following parameters (consumed by doQuery):
 * query, start, hitsPerPage, categories, site,
 * group = &lt;"site" | "signature"&gt;, orderBy, crawl, xsltUri,
 * raw true|false.
 *
 * This method is a merge of search-base.jsp, opensearch.jsp and
 * http://docs.codehaus.org/display/JETTY/Embedding+Jetty
 */
public void handle(String target, HttpServletRequest request, HttpServletResponse response, int dispatch)
        throws IOException, ServletException {
    response.setCharacterEncoding("utf-8");
    PrintWriter pw = response.getWriter();

    // Run the search, then tag the result document with branding attributes.
    Document originalDom = doQuery(request, searcher);
    originalDom.getRootElement().addAttribute("SearchEngine", "Hounder (hounder.org)")
            .addAttribute("DevelopedBy", "Flaptor (flaptor.com)");

    @SuppressWarnings("unchecked")
    String rawStr = getParameter(request.getParameterMap(), "raw");
    // raw=true (or no transforms configured): emit the DOM as plain xml.
    if (Boolean.parseBoolean(rawStr) || transformMap.isEmpty()) {
        response.setContentType("text/xml");
        String openSearchResults = DomUtil.domToString(originalDom);
        pw.print(openSearchResults);
        pw.flush();
    } else {
        // Pick the xslt transformer registered for this request path.
        Pair<Transformer, String> value = transformMap.get(request.getPathInfo());
        //System.out.println("XML HANDLE: path="+request.getPathInfo()+"  value="+value);
        if (null == value) {
            response.sendError(HttpServletResponse.SC_NOT_FOUND, "There's no xslt to serve this context.");
            return;
        }
        Transformer transformer = value.first();
        String contentType = value.last();

        response.setContentType(contentType);
        try {
            // Transformer instances are not thread-safe, hence the lock.
            synchronized (transformer) {
                transformer.transform(new DocumentSource(originalDom), new StreamResult(pw));
            }
        } catch (TransformerException e) {
            logger.error(
                    "internalProcess: exception while transforming document. (set error level to debug to see the offending document)",
                    e);
            logger.debug("offending document was: " + DomUtil.domToString(originalDom));
            response.sendError(HttpServletResponse.SC_INTERNAL_SERVER_ERROR, "Error processing internal xslt.");
            return;
        }
    }
}

From source file:com.flaptor.hounder.util.HtmlParser.java

License:Apache License

/**
 * Parse the given html document./* w ww  .jav a  2s. c o  m*/
 * @param content the html document to parse.
 * @return the parsed string.
 */
public Output parse(String url, String content) throws Exception {
    // <html xmlns=...>  ==>   <html>
    content = REGEXP_HTML.matcher(content).replaceFirst("<html>");
    // Parser keeps state, synchronize in case its used in a multi-threaded setting.
    Output out = new Output(url);
    synchronized (this) {
        try {
            // use cyberneko to parse the html documents (even broken ones)
            org.xml.sax.InputSource inputSource = new org.xml.sax.InputSource(
                    new java.io.ByteArrayInputStream(content.getBytes()));
            parser.parse(inputSource);
        } catch (Exception e) {
            logger.warn("Exception while trying to parse [" + content + "]");
            throw e;
        }
        DOMReader reader = new DOMReader();
        Document htmlDoc;
        try {
            // get the doc that resulted from parsing the text                
            org.w3c.dom.Document document = parser.getDocument();
            htmlDoc = reader.read(document);
        } catch (java.lang.StackOverflowError e) {
            logger.warn("Out of stack memory trying to parse [" + content + "]");
            throw new Exception();
        }
        // this 2 must be before the ignoreXPath, else an ignoreXPath that
        // includes the //TITLE will imply that the title is not indexed
        // extract the links
        extractLinks(htmlDoc, out);

        // extact the title
        extractTitle(htmlDoc, out);

        ignoreXpath(htmlDoc);

        replaceSeparatorTags(htmlDoc);

        // extract the text from the html tags
        extractText(htmlDoc.getRootElement(), out, HTMLPARSER_CONTENT);

        // extract special fields
        extractFields(htmlDoc, out);
    }
    out.close();
    return out;
}

From source file:com.flaptor.util.DomUtil.java

License:Apache License

/**
 * Command line entry point: parses the html file named by the first argument
 * and prints the recursively extracted element text to stdout.
 */
public static void main(String[] arg) throws Exception {
    final String html = FileUtil.readFile(new File(arg[0]));
    final Document htmlDocument = new HtmlParser().getHtmlDocument("http://url.com", html.getBytes());
    System.out.println(getElementTextRecursively(htmlDocument.getRootElement()));
}