Example usage for org.dom4j Element elementIterator

List of usage examples for org.dom4j Element elementIterator

Introduction

On this page you can find example usages of org.dom4j Element elementIterator.

Prototype

Iterator<Element> elementIterator(QName qName);

Source Link

Document

Returns an iterator over the elements contained in this element which match the given fully qualified name.

Usage

From source file:eu.sisob.uma.crawler.ResearchersCrawlers.Reporting.ReportingUtils.java

License:Open Source License

/**
 * Prints, for every id listed in a CSV file, the expression counters collected from an XML file.
 *
 * <p>The first column of each row of {@code csvfile} (separator {@code ';'}) is taken as an id.
 * Every {@code blockinfo} child of the XML root whose {@code id_entity} attribute equals one of
 * those ids contributes its child elements: each distinct (element name, {@code Content} text)
 * pair is recorded once, and the counters PA / AS are incremented when the element name contains
 * "Activity" / "Studies". Finally one line {@code id<TAB>PA<TAB>AS} is written to standard output
 * per CSV row, in row order; ids without a matching block print an empty counter field.
 *
 * @param xmlfile path or system id of the XML document to read
 * @param csvfile path of the semicolon-separated file holding the ids in its first column
 * @param outfile not used by this method
 * @throws Exception if either file cannot be read or parsed
 */
public static void extractExpressionFromCSVID(String xmlfile, String csvfile, String outfile) throws Exception {
    org.dom4j.io.SAXReader reader = new org.dom4j.io.SAXReader();
    org.dom4j.Document document = reader.read(xmlfile);
    org.dom4j.Element root = document.getRootElement();

    // Ids in CSV row order (duplicates kept) so the report can be printed without re-reading the file.
    List<String> ids = new ArrayList<String>();
    HashMap<String, List<String>> hm = new HashMap<String, List<String>>();

    CSVReader csvreader = new CSVReader(new FileReader(csvfile), ';');
    try {
        String[] nextLine;
        while ((nextLine = csvreader.readNext()) != null) {
            ids.add(nextLine[0]);
            hm.put(nextLine[0], new ArrayList<String>());
        }
    } finally {
        // The reader (and the underlying FileReader) was previously never closed.
        csvreader.close();
    }

    for (Iterator<?> i1 = root.elementIterator("blockinfo"); i1.hasNext();) {
        org.dom4j.Element e1 = (org.dom4j.Element) i1.next();

        // attributeValue yields null for a missing attribute; such a block matches no CSV id.
        String id = e1.attributeValue("id_entity");
        List<String> al = hm.get(id);
        if (al == null) {
            continue;
        }

        // A non-empty list always ends with the "PA<TAB>AS" counters written by an earlier block
        // with the same id: pop them so they can be updated and appended again below.
        int PA = 0, AS = 0;
        if (!al.isEmpty()) {
            int last = al.size() - 1;
            String[] ss = al.get(last).split("\t");
            PA = Integer.parseInt(ss[0]);
            AS = Integer.parseInt(ss[1]);
            al.remove(last);
        }

        for (Object obj : e1.elements()) {
            org.dom4j.Element exp = (org.dom4j.Element) obj;

            org.dom4j.Element content = exp.element("Content");
            if (content == null) {
                // No text to record; previously this raised a NullPointerException.
                continue;
            }

            // "\r\n" must be replaced before "\n"; in the opposite order the CRLF pattern can
            // never match and a stray '\r' is left in the text.
            String aux_content = content.getText().replace("\r\n", " ").replace("\n", " ");
            String entry = exp.getName() + "\t\t" + aux_content + "\r\n";
            if (!al.contains(entry)) {
                if (exp.getName().contains("Activity")) {
                    PA++;
                }
                if (exp.getName().contains("Studies")) {
                    AS++;
                }
                al.add(entry);
            }
        }

        al.add(PA + "\t" + AS);
        hm.put(id, al);
    }

    for (String key : ids) {
        List<String> al = hm.get(key);
        int size = al.size();
        System.out.println(key + "\t" + (size != 0 ? al.get(size - 1) : ""));
    }
}

From source file:eu.sisob.uma.crawler.ResearchersCrawlers.Reporting.ReportingUtils.java

License:Open Source License

/**
 * Prints the distinct {@code Title_name} texts of all "Activity" expressions found in an XML file.
 *
 * <p>For every {@code blockinfo} child of the XML root, each child element whose name contains
 * "Activity" contributes the text of its {@code Title_name} child. Each distinct text is written
 * once to standard output, in the iteration order of the backing {@link HashMap}.
 *
 * @param xmlfile path or system id of the XML document to read
 * @param csvfile path of a semicolon-separated file; it is read through but its rows are ignored
 * @param outfile not used by this method
 * @throws Exception if either file cannot be read or parsed
 */
public static void extractExpressions(String xmlfile, String csvfile, String outfile) throws Exception {
    org.dom4j.io.SAXReader reader = new org.dom4j.io.SAXReader();
    org.dom4j.Document document = reader.read(xmlfile);
    org.dom4j.Element root = document.getRootElement();

    // The rows are not used, but the file is still read so that a missing or unreadable CSV
    // fails exactly as before. The reader is now closed instead of being leaked.
    CSVReader csvreader = new CSVReader(new FileReader(csvfile), ';');
    try {
        while (csvreader.readNext() != null) {
            // intentionally empty
        }
    } finally {
        csvreader.close();
    }

    // Used as a set: only the keys matter.
    HashMap<String, String> hm = new HashMap<String, String>();

    for (Iterator<?> i1 = root.elementIterator("blockinfo"); i1.hasNext();) {
        org.dom4j.Element e1 = (org.dom4j.Element) i1.next();

        for (Object obj : e1.elements()) {
            org.dom4j.Element exp = (org.dom4j.Element) obj;
            if (!exp.getName().contains("Activity")) {
                continue;
            }

            org.dom4j.Element title = exp.element("Title_name");
            if (title != null) {
                // An activity without a title has nothing to report; previously this raised a
                // NullPointerException.
                hm.put(title.getText(), "");
            }
        }
    }

    for (String key : hm.keySet()) {
        System.out.println(key);
    }
}

From source file:eu.sisob.uma.crawler.ResearchersCrawlers.Workers.CleanerResearchersWebpages.java

License:Open Source License

/**
 * /*from  w w w. j a  v  a 2  s . c o  m*/
 * @param elementResearcher
 * @param path
 * @param sInstitutionName
 * @param sWebAddress
 * @param sUnitOfAssessment_Description
 * @param sResearchGroupDescription
 * @param sResearchName
 * @param sResearchInitials
 * @param sStaffIndentifier
 * @return  
 */
@Override
protected boolean actionsInResearcherNode(Element elementResearcher, String path, String sInstitutionName,
        String sWebAddress, String sUnitOfAssessment_Description, String sResearchGroupDescription,
        ResearcherNameInfo researcherNameInfo, String sStaffIndentifier) {
    for (Iterator i5 = elementResearcher.elementIterator("ResearcherWebAddress"); i5.hasNext();) {
        org.dom4j.Element e5 = (org.dom4j.Element) i5.next();

        String url = e5.getText();
        if (!url.equals("")) {
            String ext = e5.attributeValue(XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_EXT);
            if (ext == null || ext == "")
                ext = XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_EXT_VALUE_DEFAULT_HTML;
            String type = e5.attributeValue(XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_TYPE);
            if (type == null || type == "")
                ext = XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_TYPE_VALUE_DEFAULT_CV;

            String filename = ResearchersPagePostProcessor.getHashFileName(type, url, ext);

            ResearchersPagePostProcessor.cleanFile(ResearchersPagePostProcessor.getCleanerProperties(), path,
                    filename, filename);
        }
    }

    return true;
}

From source file:eu.sisob.uma.crawler.ResearchersCrawlers.Workers.DownloaderResearchersWebPagesXMLFormat.java

License:Open Source License

/**
 * Downloads and cleans the web pages of every researcher below one institution element.
 *
 * <p>Walks institution -> unit of assessment -> research group -> researcher -> web address and
 * calls {@code ResearchersPagePostProcessor.downloadAndClean} for every non-empty address. With
 * {@code LocalFormatType.TREE_DIRECTORY} a directory per institution / unit / researcher is
 * obtained through {@code giveMeTheDirectory}; otherwise everything goes to {@code destDir}.
 * Any exception is logged and ends the traversal; nothing is rethrown.
 *
 * <p>NOTE(review): the type default used to be written into {@code ext} by mistake, leaving
 * {@code type} null or empty. Code that derives file names from the same attributes elsewhere
 * must apply the same defaulting - confirm.
 *
 * @param destDir base destination directory
 * @param downloading_format_type layout of the downloaded files
 * @param elInstitution institution element to process
 * @param redownload forwarded to {@code giveMeTheDirectory} and {@code downloadAndClean}
 */
public static void downloadResearchesPages(String destDir, LocalFormatType downloading_format_type,
        org.dom4j.Element elInstitution, boolean redownload) {
    try {
        boolean treeLayout = downloading_format_type.equals(LocalFormatType.TREE_DIRECTORY);

        String sInstitutionName = elInstitution.element(XMLTags.INSTITUTION_NAME).getText();

        File dirBase = DownloaderResearchersWebPagesXMLFormat.giveMeTheDirectory(destDir, false);

        File dirI = null;
        if (treeLayout) {
            dirI = DownloaderResearchersWebPagesXMLFormat.giveMeTheDirectory(
                    destDir + "\\" + sInstitutionName.replaceAll("[^a-z^A-Z]", "") + "\\", redownload);
        }

        for (Iterator<?> i2 = elInstitution.elementIterator(XMLTags.UNIT_OF_ASSESSMENT); i2.hasNext();) {
            org.dom4j.Element e2 = (org.dom4j.Element) i2.next();

            String sUnitOfAssessment_Description = e2.element(XMLTags.UNIT_OF_ASSESSMENT_DESCRIPTION)
                    .getText();

            File dirUAD = null;
            if (treeLayout) {
                dirUAD = DownloaderResearchersWebPagesXMLFormat.giveMeTheDirectory(dirI.getPath() + "\\"
                        + sUnitOfAssessment_Description.replaceAll("[^a-z^A-Z]", "") + "\\", redownload);
            }

            for (Iterator<?> i3 = e2.elementIterator(XMLTags.RESEARCHGROUP); i3.hasNext();) {
                org.dom4j.Element e3 = (org.dom4j.Element) i3.next();

                // The value is unused; the lookup is kept so that a group without a description
                // still aborts through the catch below, as before.
                e3.element(XMLTags.RESEARCHGROUP_DESCRIPTION).getText();

                for (Iterator<?> i4 = e3.elementIterator(XMLTags.RESEARCHER); i4.hasNext();) {
                    org.dom4j.Element e4 = (org.dom4j.Element) i4.next();

                    String sResearchName = e4.element(XMLTags.RESEARCHER_LASTNAME).getText();
                    String sResearchInitials = e4.element(XMLTags.RESEARCHER_INITIALS).getText();

                    String sAux = sResearchName.replaceAll("[^a-z^A-Z]", "") + "#"
                            + sResearchInitials.replaceAll("[^a-z^A-Z]", "");

                    File dirR;
                    if (treeLayout) {
                        dirR = DownloaderResearchersWebPagesXMLFormat
                                .giveMeTheDirectory(dirUAD.getPath() + "\\" + sAux + "\\", false);
                    } else {
                        dirR = dirBase;
                    }

                    for (Iterator<?> i5 = e4.elementIterator(XMLTags.RESEARCHER_WEB_ADDRESS); i5.hasNext();) {
                        org.dom4j.Element e5 = (org.dom4j.Element) i5.next();

                        String url = e5.getText();
                        if (url.equals("")) {
                            continue;
                        }

                        // Strings are compared by content; "==" only matched interned empty strings.
                        String ext = e5.attributeValue(XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_EXT);
                        if (ext == null || ext.isEmpty()) {
                            ext = XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_EXT_VALUE_DEFAULT_HTML;
                        }

                        String type = e5.attributeValue(XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_TYPE);
                        if (type == null || type.isEmpty()) {
                            // Fixed: the default belongs to "type", not to "ext".
                            type = XMLTags.RESEARCHER_WEB_ADDRESS_ATTR_TYPE_VALUE_DEFAULT_CV;
                        }

                        ResearchersPagePostProcessor.downloadAndClean(dirR.getAbsolutePath(), type, url, ext,
                                true, redownload);
                    }
                }
            }
        }
    } catch (Exception ex) {
        ProjectLogger.LOGGER.error("ERROR: " + ex.getMessage());
    }
}

From source file:eu.sisob.uma.crawler.ResearchersCrawlers.Workers.DownloaderResearchersWebPagesXMLFormat.java

License:Open Source License

/**
 * Starts one downloader thread per {@code Institution} element of an XML file.
 *
 * <p>The destination directory is created when it does not exist. A new
 * {@code DownloaderResearchersWebPagesXMLFormat} is started for the next institution whenever
 * {@code monitor.canCreateNewThread()} allows it; otherwise the loop sleeps five seconds and
 * asks again. The method returns after the last thread has been started, without joining here.
 * Failures are logged and end the method; nothing is rethrown.
 *
 * @param xmlFile path or system id of the XML document listing the institutions
 * @param destDir destination directory
 * @param downloading_format_type layout of the downloaded files, forwarded to each worker
 * @param numberThreads passed to the {@code MonitorThread} constructor
 * @param redownload forwarded to each worker
 */
public static void downloadAllResearchersPagesWithThreads(String xmlFile, String destDir,
        LocalFormatType downloading_format_type, int numberThreads, boolean redownload) {
    try {
        org.dom4j.io.SAXReader reader = new org.dom4j.io.SAXReader();
        org.dom4j.Document document = reader.read(xmlFile);
        org.dom4j.Element root = document.getRootElement();

        File dir = new File(destDir + "\\");
        // mkdirs also creates missing parent directories, so nested destinations work.
        if (!dir.exists() && !dir.mkdirs()) {
            throw new Exception("Cant create " + dir.getPath());
        }

        MonitorThread monitor = new MonitorThread(numberThreads);

        for (Iterator<?> i1 = root.elementIterator("Institution"); i1.hasNext();) {
            if (monitor.canCreateNewThread()) {
                DownloaderResearchersWebPagesXMLFormat ath = new DownloaderResearchersWebPagesXMLFormat(destDir,
                        downloading_format_type, (org.dom4j.Element) i1.next(), monitor, redownload);
                ath.start();
            } else {
                // No free slot: wait before asking the monitor again (the iterator is not advanced).
                sleep(5000);
            }
        }
    } catch (Exception ex) {
        if (ex instanceof InterruptedException) {
            // Keep the interrupt visible to the caller instead of silently clearing it.
            Thread.currentThread().interrupt();
        }
        // Failures were silently discarded before; log them like the other download entry points.
        ProjectLogger.LOGGER.error(ex.getMessage());
    }
}

From source file:eu.sisob.uma.crawler.ResearchersCrawlers.Workers.DownloaderResearchersWebPagesXMLFormat.java

License:Open Source License

/**
 * Downloads the researcher pages of every {@code Institution} element of an XML file, one
 * institution after another.
 *
 * <p>The destination directory is first passed to {@code FileFootils.deleteDir} and then created
 * again (presumably to start from an empty directory - confirm what {@code deleteDir} removes).
 * Each institution is handed to {@code downloadResearchesPages}. Failures are logged and end the
 * method; nothing is rethrown.
 *
 * @param xmlFile path or system id of the XML document listing the institutions
 * @param destDir destination directory
 * @param downloading_format_type layout of the downloaded files
 * @param redownload forwarded to {@code downloadResearchesPages}
 */
public static void downloadAllResearchersPages(String xmlFile, String destDir,
        LocalFormatType downloading_format_type, boolean redownload) {
    try {
        org.dom4j.io.SAXReader reader = new org.dom4j.io.SAXReader();
        org.dom4j.Document document = reader.read(xmlFile);
        org.dom4j.Element root = document.getRootElement();

        FileFootils.deleteDir(destDir + "\\");

        File dir = new File(destDir + "\\");
        // mkdirs also creates missing parent directories; like mkdir it returns false when the
        // directory could not be created (including when it still exists after deleteDir).
        if (!dir.mkdirs()) {
            throw new Exception("Cant create " + dir.getPath());
        }

        for (Iterator<?> i1 = root.elementIterator("Institution"); i1.hasNext();) {
            org.dom4j.Element e1 = (org.dom4j.Element) i1.next();

            DownloaderResearchersWebPagesXMLFormat.downloadResearchesPages(destDir, downloading_format_type, e1,
                    redownload);
        }
    } catch (Exception ex) {
        ProjectLogger.LOGGER.error(ex.getMessage());
    }
}

From source file:eu.sisob.uma.crawler.ResearchersCrawlers.Workers.ExportDocumentsInFolder.java

License:Open Source License

/**
 * Reader folder of one researcher and takes the uri of clean file for to make infoblock. 
 * @param elementResearcher/* www.  j a  v a2s .c o  m*/
 * @param path
 * @param sInstitutionName
 * @param sWebAddress
 * @param sUnitOfAssessment_Description
 * @param sResearchGroupDescription
 * @param sResearchName
 * @param sResearchInitials
 * @param sStaffIndentifier
 */
@Override
protected boolean actionsInResearcherNode(Element elementResearcher, String path, String sInstitutionName,
        String sWebAddress, String sUnitOfAssessment_Description, String sResearchGroupDescription,
        ResearcherNameInfo researcherNameInfo, String sStaffIndentifier) {
    File fAux = new File(path);

    File[] adirRW = fAux.listFiles();

    if (adirRW != null) {
        for (File file : adirRW) {
            if (!file.isDirectory())
                continue;
            boolean b = false;
            for (Iterator i5 = elementResearcher.elementIterator("ResearcherWebAddress"); i5.hasNext();) {
                org.dom4j.Element e5 = (org.dom4j.Element) i5.next();

                String sURL = e5.getText();

                byte[] bytes = sURL.getBytes();
                String sAuxxx = path + "\\" + Integer.toHexString(MurmurHash.hash(bytes, 5));

                if (file.getPath().equals(sAuxxx.replace("\\\\", "\\"))) {
                    b = true;
                    File dirFinalFiles = new File(file.getPath());
                    File[] afinalFiles = dirFinalFiles.listFiles();

                    //Search all clean files
                    for (File finalFile : afinalFiles) {
                        if (finalFile.getName().contains("clean_")) {
                            //FIXME
                            if (!finalFile.getName().contains("pub"))
                                if (finalFile.exists()) {
                                    rootOut.addElement("infoblock")
                                            .addAttribute(DataExchangeLiterals.MIDDLE_ELEMENT_XML_ID_ENTITY_ATT,
                                                    sStaffIndentifier)
                                            .addAttribute(
                                                    DataExchangeLiterals.MIDDLE_ELEMENT_XML_ID_TEXTMININGPARSER_ATT,
                                                    DataExchangeLiterals.ID_TEXTMININGPARSER_GATERESEARCHER)
                                            .addAttribute(
                                                    DataExchangeLiterals.MIDDLE_ELEMENT_XML_ID_ANNOTATIONRECOLLECTING,
                                                    DataExchangeLiterals.ID_TEXTMININGPARSER_GATERESEARCHER_DEFAULTANNREC)
                                            .addText(finalFile.getAbsolutePath());
                                }
                        }
                    }
                }
            }
            if (!b)
                Logger.getLogger("MyLog").warning("FILE MISSED: " + file.getPath());
        }
    }
    return true;
}

From source file:eu.sisob.uma.crawler.ResearchersCrawlers.Workers.ExportDocumentsOnXMLFileForTextMiningCreator.java

License:Open Source License

/**
 * Reader folder of one researcher and takes the uri of clean file for to make infoblock. 
 * Note: Read comment in top of file./*w ww.j a v a  2 s.co m*/
 * @param elementResearcher
 * @param path
 * @param sInstitutionName
 * @param sWebAddress
 * @param sUnitOfAssessment_Description
 * @param sResearchGroupDescription
 * @param sResearchName
 * @param sResearchInitials
 * @param sStaffIndentifier
 */
@Override
protected boolean actionsInResearcherNode(Element elementResearcher, String path, String sInstitutionName,
        String sWebAddress, String sUnitOfAssessment_Description, String sResearchGroupDescription,
        ResearcherNameInfo researcherNameInfo, String sStaffIndentifier) {
    File fAux = new File(path);

    File[] adirRW = fAux.listFiles();

    if (adirRW != null) {
        for (File file : adirRW) {
            for (Iterator i5 = elementResearcher.elementIterator("ResearcherWebAddress"); i5.hasNext();) {
                org.dom4j.Element e5 = (org.dom4j.Element) i5.next();

                String sURL = e5.getText();

                byte[] bytes = sURL.getBytes();
                String sAuxxx = path + "\\" + Integer.toHexString(MurmurHash.hash(bytes, 5));

                if (file.getPath().equals(sAuxxx.replace("\\\\", "\\"))) {
                    File dirFinalFiles = new File(file.getPath());
                    File[] afinalFiles = dirFinalFiles.listFiles();

                    //Search all clean files
                    for (File finalFile : afinalFiles) {
                        if (finalFile.getName().contains("clean_")) {
                            //FIXME
                            if (!finalFile.getName().contains("pub"))
                                if (finalFile.exists()) {
                                    rootOut.addElement("infoblock")
                                            .addAttribute(DataExchangeLiterals.MIDDLE_ELEMENT_XML_ID_ENTITY_ATT,
                                                    sStaffIndentifier)
                                            .addAttribute(
                                                    DataExchangeLiterals.MIDDLE_ELEMENT_XML_ID_TEXTMININGPARSER_ATT,
                                                    DataExchangeLiterals.ID_TEXTMININGPARSER_GATERESEARCHER)
                                            .addAttribute(
                                                    DataExchangeLiterals.MIDDLE_ELEMENT_XML_ID_ANNOTATIONRECOLLECTING,
                                                    DataExchangeLiterals.ID_TEXTMININGPARSER_GATERESEARCHER_DEFAULTANNREC)
                                            .addText(finalFile.getAbsolutePath());
                                }
                        }
                    }
                } else {
                    Logger.getLogger("MyLog").warning(
                            "DIR NOT EQUAL: " + file.getPath() + " != " + sAuxxx.replace("\\\\", "\\"));
                }
            }
        }
    }

    return true;
}

From source file:eu.sisob.uma.crawler.ResearchersCrawlers.Workers.ExportDocumentsOnXMLFileForTextMiningCreatorWithFilter.java

License:Open Source License

/**
 * /*  ww w  .j av  a  2  s .  c om*/
 * @param elementResearcher
 * @param path
 * @param sInstitutionName
 * @param sWebAddress
 * @param sUnitOfAssessment_Description
 * @param sResearchGroupDescription
 * @param sResearchName
 * @param sResearchInitials
 * @param sStaffIndentifier
 */
@Override
protected boolean actionsInResearcherNode(Element elementResearcher, String path, String sInstitutionName,
        String sWebAddress, String sUnitOfAssessment_Description, String sResearchGroupDescription,
        ResearcherNameInfo researcherNameInfo, String sStaffIndentifier) {
    File fAux = new File(path);

    File[] adirRW = fAux.listFiles();

    if (adirRW != null) {
        for (File file : adirRW) {
            for (Iterator i5 = elementResearcher.elementIterator("ResearcherWebAddress"); i5.hasNext();) {
                org.dom4j.Element e5 = (org.dom4j.Element) i5.next();

                String sURL = e5.getText();

                byte[] bytes = sURL.getBytes();
                String sAuxxx = path + "\\" + Integer.toHexString(MurmurHash.hash(bytes, 5));

                if (file.getPath().equals(sAuxxx.replace("\\\\", "\\"))) {
                    File fileURL = new File(file.getPath() + "\\clean_index.html");

                    boolean b = true;
                    org.jsoup.nodes.Document doc = null;
                    try {
                        //                                                org.jsoup.nodes.Document doc2 = Jsoup.connect(sURL).get();
                        doc = Jsoup.parse(fileURL, "UTF-8", sURL);

                        org.jsoup.select.Elements els = doc.body().select(xpathExp);
                        //(":containsOwn(" + sLiteralExp + ")");
                        //logger.info("URL: " + fileURL.getPath() + " uri: " + doc.baseUri());
                        //logger.log(Level.INFO, fileURL.getAbsolutePath());
                        if (els.size() > 0) {
                            //logger.info(fileURL.getAbsolutePath());
                            Logger.getLogger("MyLog").log(Level.INFO,
                                    "(" + xpathExp + ") => " + fileURL.getAbsolutePath());
                            hitsTable[0][0]++;
                            rootOut.addElement("infoblock")
                                    .addAttribute(DataExchangeLiterals.MIDDLE_ELEMENT_XML_ID_ENTITY_ATT,
                                            sStaffIndentifier)
                                    .addAttribute(
                                            DataExchangeLiterals.MIDDLE_ELEMENT_XML_ID_TEXTMININGPARSER_ATT,
                                            DataExchangeLiterals.ID_TEXTMININGPARSER_GATERESEARCHER)
                                    .addAttribute(
                                            DataExchangeLiterals.MIDDLE_ELEMENT_XML_ID_ANNOTATIONRECOLLECTING,
                                            DataExchangeLiterals.ID_TEXTMININGPARSER_GATERESEARCHER_DEFAULTANNREC)
                                    .addText(fileURL.getAbsolutePath());

                            sContentForView += sURL + " - " + els.first().text() + " - "
                                    + (els.first().absUrl("href")) + "\r\n\r\n";
                        } else
                            hitsTable[1][0]++;
                    } catch (Exception ex) {
                        Logger.getLogger("MyLog").log(Level.SEVERE,
                                "ERROR URL: " + fileURL.getPath() + " Msg: " + ex.getMessage() + ".");
                    }
                }
            }
        }
    }
    return true;
}

From source file:eu.sisob.uma.crawlerWorks.WebPagesOfUniversities.Format.ConversorFromXMLtoCSV.java

License:Open Source License

/**
 * Writes a row to {@code writer_nofound} for a researcher that has no non-empty web address.
 *
 * <p>The row holds, in this order: staff identifier, whole name, first name, last name, initial,
 * unit of assessment description and institution name. Nothing is written when at least one
 * web-address child of the researcher element has non-empty text.
 *
 * @return always {@code true}
 */
@Override
protected boolean actionsInResearcherNode(org.dom4j.Element elementResearcher, String path,
        String sInstitutionName, String sWebAddress, String sUnitOfAssessment_Description,
        String sResearchGroupDescription, ResearcherNameInfo researcherNameInfo, String sStaffIndentifier) {
    // Look for the first web address that actually carries text.
    boolean addressFound = false;
    Iterator addresses = elementResearcher.elementIterator(XMLTags.RESEARCHER_WEB_ADDRESS);
    while (!addressFound && addresses.hasNext()) {
        org.dom4j.Element address = (org.dom4j.Element) addresses.next();
        addressFound = !address.getText().equals("");
    }

    if (addressFound) {
        return true;
    }

    String[] row = new String[] {
        sStaffIndentifier,
        researcherNameInfo.whole_name,
        researcherNameInfo.first_name,
        researcherNameInfo.last_name,
        researcherNameInfo.initial,
        sUnitOfAssessment_Description,
        sInstitutionName
    };
    this.writer_nofound.writeNext(row);
    return true;
}