Example usage for the org.jsoup.select.Elements constructor Elements()

List of usage examples for org.jsoup.select Elements Elements

Introduction

On this page you can find example usages of the org.jsoup.select.Elements constructor Elements().

Prototype

public Elements() 

Source Link

Usage

From source file:module.entities.NameFinder.RegexNameFinder.java

/**
 * Entry point of the regex name finder.
 *
 * <p>Initialises the database and lexicons, extracts a signature (name and
 * optional role, separated by {@code #}) from the HTML body of every
 * consultation and of every completed consultation, stores the results via
 * {@code DB.insertDemocracitConsultationMinister}, and finally updates the
 * run log entry.
 *
 * @param args the command line arguments; when exactly one argument is given
 *             it is assigned to {@code Config.configFile}
 * @throws SQLException declared by the original signature (presumably raised by the DB helpers)
 * @throws IOException  declared by the original signature
 */
public static void main(String[] args) throws SQLException, IOException {

    if (args.length == 1) {
        Config.configFile = args[0];
    }
    long lStartTime = System.currentTimeMillis();
    Timestamp startTime = new Timestamp(lStartTime);
    System.out.println("Regex Name Finder process started at: " + startTime);
    DB.initPostgres();
    regexerId = DB.LogRegexFinder(lStartTime);
    initLexicons();
    JSONObject obj = new JSONObject();

    // First pass: open consultations.
    TreeMap<Integer, String> consultations = DB.getDemocracitConsultationBody();
    TreeMap<Integer, String> consFoundNames = new TreeMap<>();
    TreeMap<Integer, String> consFoundRoles = new TreeMap<>();
    collectSignatures(consultations, consFoundNames, consFoundRoles, "--");
    DB.insertDemocracitConsultationMinister(consFoundNames, consFoundRoles);

    // Second pass: completed consultations. The names now go into their own map
    // (previously they were added to consFoundNames, leaving the map that is
    // actually inserted below permanently empty).
    // NOTE(review): the roles map is still shared with the first pass, exactly as
    // before; confirm whether insertDemocracitConsultationMinister expects a
    // separate roles map here.
    TreeMap<Integer, String> consultationsCompletedText = DB.getDemocracitCompletedConsultationBody();
    TreeMap<Integer, String> complConsFoundNames = new TreeMap<>();
    collectSignatures(consultationsCompletedText, complConsFoundNames, consFoundRoles, "++");
    DB.insertDemocracitConsultationMinister(complConsFoundNames, consFoundRoles);

    long lEndTime = System.currentTimeMillis();
    // Report the end time, not the start time.
    System.out.println("Regex Name Finder process finished at: " + new Timestamp(lEndTime));
    obj.put("message", "Regex Name Finder finished with no errors");
    obj.put("details", "");
    DB.UpdateLogRegexFinder(lEndTime, regexerId, obj);
    DB.close();
}

/**
 * Extracts the signature of every consultation body and records the name and role found.
 *
 * @param bodies     consultation id mapped to its HTML body
 * @param foundNames receives the trimmed name for every id whose signature contains {@code #}
 * @param foundRoles receives the trimmed role (empty when absent) for the same ids
 * @param missPrefix prefix printed to {@code System.err} before the id when no signature is found
 */
private static void collectSignatures(TreeMap<Integer, String> bodies, TreeMap<Integer, String> foundNames,
        TreeMap<Integer, String> foundRoles, String missPrefix) {
    for (int consId : bodies.keySet()) {
        Elements allPars = splitParagraphsOnBreaks(bodies.get(consId));
        String signature = getSignatureFromParagraphs(allPars);
        if (!signature.contains("#")) {
            System.err.println(missPrefix + consId);
            continue;
        }
        // String.split drops trailing empty strings, so a signature consisting only
        // of '#' yields an empty array; guard both indexes instead of assuming [0].
        String[] signTokens = signature.split("#");
        String signName = signTokens.length > 0 ? signTokens[0] : "";
        String roleName = signTokens.length > 1 ? signTokens[1] : "";
        foundNames.put(consId, signName.trim());
        foundRoles.put(consId, roleName.trim());
    }
}

/**
 * Parses an HTML body and returns its {@code <p>} elements, with every paragraph
 * that contains {@code <br>} split into one paragraph per line-break segment.
 *
 * @param html the HTML body to parse
 * @return the paragraphs in document order
 */
private static Elements splitParagraphsOnBreaks(String html) {
    Document doc = Jsoup.parse(html);
    Elements allPars = new Elements();
    for (Element par : doc.select("p")) {
        String inner = par.html();
        if (inner.contains("<br>")) {
            // Turn each <br> into a paragraph boundary and re-parse the fragment.
            String out = "<p>" + inner.replaceAll("<br>", "</p><p>") + "</p>";
            allPars.addAll(Jsoup.parse(out).select("p"));
        } else {
            allPars.add(par);
        }
    }
    return allPars;
}

From source file:net.slkdev.swagger.confluence.service.impl.XHtmlToConfluenceServiceImpl.java

/**
 * Builds the title-to-link map from the table of contents of the current swagger document.
 *
 * <p>Category links are taken from the first child of each entry under the first
 * {@code .sectlevel1} element; individual links are the anchors inside each
 * {@code .sectlevel2} element, numbered by the 1-based position of that element.
 *
 * @return the map populated by {@code addLinksByType}
 */
private Map<String, ConfluenceLink> buildTableOfContentsLinkMap() {
    final Map<String, ConfluenceLink> linksByTitle = new HashMap<>();
    final Elements toc = SWAGGER_DOCUMENT.get().select(".toc");

    // Collect the first child of every top-level TOC entry.
    final Elements categoryLinks = new Elements();
    for (final Element categoryEntry : toc.select(".sectlevel1").first().children()) {
        categoryLinks.add(categoryEntry.children().first());
    }

    // Selected before any links are registered, matching the original evaluation order.
    final Elements sections = toc.select(".sectlevel2");

    addLinksByType(linksByTitle, categoryLinks, PageType.CATEGORY, null);

    int categoryNumber = 0;
    for (final Element section : sections) {
        categoryNumber++;
        addLinksByType(linksByTitle, section.select("a"), INDIVIDUAL, categoryNumber);
    }

    return linksByTitle;
}

From source file:be.ibridge.kettle.jsoup.JsoupInput.java

/**
 * Loads the input into {@code data.jsoupReader}, evaluates the selector path of
 * every input field and stores one {@code Elements} per field in
 * {@code data.resultList}.
 *
 * <p>The first field fixes {@code data.nrrecords}; a later field with a non-zero,
 * different match count is rejected. Fields with no matches are replaced by a
 * placeholder holding {@code data.nrrecords} null entries.
 *
 * @throws Exception a {@code KettleException} when match counts disagree, or
 *                   whatever the Jsoup parse calls raise
 */
private void parseJsoup() throws Exception {

    // Pick the source: file, URL (1000 ms timeout) or the raw string.
    if (data.file != null) {
        data.jsoupReader = Jsoup.parse(new File(data.filename), "UTF-8");
    } else if (meta.isReadUrl()) {
        data.jsoupReader = Jsoup.parse(new URL(data.stringToParse), 1000);
    } else {
        data.jsoupReader = Jsoup.parse(data.stringToParse);
    }

    List<Elements> selections = new ArrayList<Elements>();
    data.nrrecords = -1;
    data.recordnr = 0;
    String previousPath = "";
    for (int fieldIdx = 0; fieldIdx < data.nrInputFields; fieldIdx++) {
        String path = meta.getInputFields()[fieldIdx].getPath();
        Elements matches = data.jsoupReader.select(path);
        boolean countAlreadyFixed = data.nrrecords != -1;
        if (matches.size() > 0 && countAlreadyFixed && data.nrrecords != matches.size()) {
            throw new KettleException(BaseMessages.getString(PKG, "JsoupInput.Error.BadStructure",
                    matches.size(), path, previousPath, data.nrrecords));
        }
        selections.add(matches);
        if (data.nrrecords == -1) {
            data.nrrecords = matches.size();
        }
        previousPath = path;
    }

    data.resultList = new ArrayList<Elements>();

    for (Elements selection : selections) {
        Elements row = selection;
        if (row == null || row.size() == 0) {
            if (data.nrrecords == -1) {
                data.nrrecords = 1;
            }
            // Nothing matched this path: substitute a dummy structure of
            // nrrecords null entries so every field has the same length.
            row = new Elements();
            for (int n = 0; n < data.nrrecords; n++) {
                row.add(null);
            }
        }
        data.resultList.add(row);
    }

    if (log.isDetailed()) {
        logDetailed(BaseMessages.getString(PKG, "JsoupInput.Log.NrRecords", data.nrrecords));
    }

}

From source file:eu.sisob.uma.extractors.adhoc.cvfilesinside.InternalCVFilesExtractor.java

/**
 *
 * @param input_file
 * @param data_dir
 * @param output_file
 * @param error_sw
 */
public static void extract_cv_files(File input_file, File data_dir,
        File output_file/*, File output_file_2, File results_dir,*/, StringWriter error_sw) {
    CSVReader reader = null;
    try {
        reader = new CSVReader(new FileReader(input_file), CSV_SEPARATOR);
    } catch (FileNotFoundException ex) {
        Logger.getRootLogger().error("Error reading " + input_file.getName() + " - " + ex.toString());
    }

    int idStaffIdentifier = -1;
    int idName = -1;
    int idFirstName = -1;
    int idLastName = -1;
    int idInitials = -1;
    int idUnitOfAssessment_Description = -1;
    int idInstitutionName = -1;
    int idWebAddress = -1;
    int idResearchGroupDescription = -1;
    int idResearcherWebAddress = -1;
    int idResearcherWebAddressType = -1;
    int idResearcherWebAddressExt = -1;
    int idScoreUrl = -1;
    int idEmail = -1;
    int idScoreEmail = -1;

    String[] nextLine;
    try {
        if ((nextLine = reader.readNext()) != null) {
            //Locate indexes            
            //Locate indexes                        
            for (int i = 0; i < nextLine.length; i++) {
                String column_name = nextLine[i];
                if (column_name.equals(FileFormatConversor.CSV_COL_ID))
                    idStaffIdentifier = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_NAME))
                    idName = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_FIRSTNAME))
                    idFirstName = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_LASTNAME))
                    idLastName = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_INITIALS))
                    idInitials = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_SUBJECT))
                    idUnitOfAssessment_Description = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_INSTITUTION_NAME))
                    idInstitutionName = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_INSTITUTION_URL))
                    idWebAddress = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_URL))
                    idResearcherWebAddress = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_TYPE))
                    idResearcherWebAddressType = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_EXT))
                    idResearcherWebAddressExt = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_SCORE_URL))
                    idScoreUrl = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_EMAIL))
                    idEmail = i;
                else if (column_name.equals(FileFormatConversor.CSV_COL_SCORE_EMAIL))
                    idScoreEmail = i;
            }
        }
    } catch (Exception ex) {
        String error_msg = "Error reading headers of " + input_file.getName();
        Logger.getRootLogger().error(error_msg + " - " + ex.toString());
        if (error_sw != null)
            error_sw.append(error_msg + "\r\n");

        return;
    }

    if (idResearcherWebAddress != -1 && idResearcherWebAddressType != -1 && idResearcherWebAddressExt != -1
            && idStaffIdentifier != -1 && idLastName != -1 && idInitials != -1) {
        if (true) {
            try {
                String header = "";
                header += "\"" + FileFormatConversor.CSV_COL_ID + "\"" + CSV_SEPARATOR;
                header += "\"" + FileFormatConversor.CSV_COL_LASTNAME + "\"" + CSV_SEPARATOR;
                header += "\"" + FileFormatConversor.CSV_COL_INITIALS + "\"" + CSV_SEPARATOR;
                if (idFirstName != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_FIRSTNAME + "\"" + CSV_SEPARATOR;
                if (idName != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_NAME + "\"" + CSV_SEPARATOR;
                if (idEmail != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_EMAIL + "\"" + CSV_SEPARATOR;
                if (idInstitutionName != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_INSTITUTION_NAME + "\"" + CSV_SEPARATOR;
                if (idWebAddress != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_INSTITUTION_URL + "\"" + CSV_SEPARATOR;
                header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_URL + "\"" + CSV_SEPARATOR;
                header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_EXT + "\"" + CSV_SEPARATOR;
                header += "\"" + FileFormatConversor.CSV_COL_RESEARCHER_PAGE_TYPE + "\"" + CSV_SEPARATOR;
                header += "\"" + FileFormatConversor.CSV_COL_SCORE_URL + "\"" + CSV_SEPARATOR;
                if (idScoreEmail != -1)
                    header += "\"" + FileFormatConversor.CSV_COL_SCORE_EMAIL + "\"" + CSV_SEPARATOR;
                header += "\r\n";
                FileUtils.write(output_file, header, "UTF-8", false);
                // DOWNLOAD HERE THE HOME PAGE 
                //FileUtils.write(output_file_2, header, "UTF-8", false);

            } catch (IOException ex) {
                Logger.getLogger("root").error(ex.toString());
                error_sw.append("Error creating output files\r\n");
            }
        }

        try {
            //                DOWNLOAD HERE THE HOME PAGE 
            //                if(!results_dir.exists())
            //                    results_dir.mkdirs();                
            //                File homepage_results_dirs = new File(results_dir, "HOMEPAGE");
            //                if(!homepage_results_dirs.exists())
            //                    homepage_results_dirs.mkdirs();
            //if(!test_only_output)
            {
                Pattern p1 = Pattern.compile("([a-zA-Z0-9#._-]+)+");

                while ((nextLine = reader.readNext()) != null) {
                    nextLine[idLastName] = nextLine[idLastName].replaceAll("[^a-zA-Z]", " ").toLowerCase();
                    nextLine[idInitials] = nextLine[idInitials].replaceAll("[^a-zA-Z]", " ").toLowerCase();
                    if (idFirstName != -1)
                        nextLine[idFirstName] = nextLine[idFirstName].replaceAll("[^a-zA-Z]", " ")
                                .toLowerCase();
                    if (idName != -1)
                        nextLine[idName] = nextLine[idName].replaceAll("[^a-zA-Z]", " ").toLowerCase();

                    Document content = null;
                    String researcher_page_url = nextLine[idResearcherWebAddress];
                    File temp_file = null;
                    if (p1.matcher(researcher_page_url).matches()) {

                    } else {

                        try {

                            Logger.getRootLogger().info("Reading " + researcher_page_url);

                            temp_file = File.createTempFile("internal-cv-files-", ".tmp");
                            URL fetched_url = Downloader.fetchURL(researcher_page_url);
                            FileUtils.copyURLToFile(fetched_url, temp_file);
                            long sizeInBytes = temp_file.length();
                            long sizeInMb = sizeInBytes / (1024 * 1024);
                            if (sizeInMb > 100) {
                                content = null;
                            } else {
                                String text_content = FileUtils.readFileToString(temp_file);
                                String check_string = "";
                                if (text_content.length() <= 100) {
                                    check_string = text_content.substring(0, text_content.length());
                                } else {
                                    check_string = text_content.substring(0, 100);
                                }
                                if (check_string.toLowerCase().contains("html")) {
                                    content = Jsoup.parse(text_content);
                                    content.setBaseUri(researcher_page_url);
                                    //                                          DOWNLOAD HERE THE HOME PAGE                                        
                                    //                                        String filename = nextLine[idStaffIdentifier] + "_HOMEPAGE_" + MD5(researcher_page_url) + ".html";
                                    //                                        FileUtils.copyFile(temp_file, new File(homepage_results_dirs, filename));                                        
                                    //                                        
                                    //                                        String result = "";                        
                                    //                                        result += "\"" + nextLine[idStaffIdentifier] + "\"" + CSV_SEPARATOR;
                                    //                                        result += "\"" + nextLine[idLastName] + "\"" + CSV_SEPARATOR;
                                    //                                        result += "\"" + nextLine[idInitials] + "\"" + CSV_SEPARATOR;                                    
                                    //                                        if(idFirstName != -1) result += "\"" + nextLine[idFirstName] + "\"" + CSV_SEPARATOR;  
                                    //                                        if(idName != -1) result += "\"" + nextLine[idName] + "\"" + CSV_SEPARATOR;  
                                    //                                        if(idEmail != -1) result += "\"" + nextLine[idEmail] + "\"" + CSV_SEPARATOR; 
                                    //                                        if(idInstitutionName != -1) result += "\"" + nextLine[idInstitutionName] + "\"" + CSV_SEPARATOR;  
                                    //                                        if(idWebAddress != -1) result += "\"" + nextLine[idWebAddress] + "\"" + CSV_SEPARATOR;                 
                                    //                                        result += "\"" + filename + "\"" + CSV_SEPARATOR;
                                    //                                        result += "\"" + nextLine[idResearcherWebAddressType] + "\"" + CSV_SEPARATOR;
                                    //                                        result += "\"" + nextLine[idResearcherWebAddressExt] + "\"" + CSV_SEPARATOR;
                                    //                                        result += "\"" + (idScoreUrl != -1 ? nextLine[idScoreUrl] : "") + "\"" + CSV_SEPARATOR;
                                    //                                        if(idScoreEmail != -1) result += "\"" + nextLine[idScoreEmail] + "\"" + CSV_SEPARATOR; 
                                    //                                        result += "\r\n";
                                    //
                                    //                                        try {
                                    //                                            FileUtils.write(output_file_2, result, "UTF-8", true);
                                    //                                        } catch (IOException ex) {
                                    //                                            Logger.getLogger("root").error(ex.toString());
                                    //                                        }
                                } else {
                                    throw new Exception(researcher_page_url + " is not html document");
                                }
                            }

                        } catch (Exception ex) {
                            Logger.getLogger("root").error("" + researcher_page_url + " could not loaded", ex);
                            error_sw.append("" + researcher_page_url + " could not loaded");
                            content = null;
                        } catch (java.lang.OutOfMemoryError ex2) {
                            Logger.getLogger("root")
                                    .error("" + researcher_page_url + " could not loaded (out of memory)", ex2);
                            error_sw.append("" + researcher_page_url + " could not loaded (out of memory)");
                            content = null;
                        } finally {
                            if (temp_file != null)
                                temp_file.delete();
                        }

                    }
                    //Add sources to output
                    {
                        String result = "";
                        result += "\"" + nextLine[idStaffIdentifier] + "\"" + CSV_SEPARATOR;
                        result += "\"" + nextLine[idLastName] + "\"" + CSV_SEPARATOR;
                        result += "\"" + nextLine[idInitials] + "\"" + CSV_SEPARATOR;
                        if (idFirstName != -1)
                            result += "\"" + nextLine[idFirstName] + "\"" + CSV_SEPARATOR;
                        if (idName != -1)
                            result += "\"" + nextLine[idName] + "\"" + CSV_SEPARATOR;
                        if (idEmail != -1)
                            result += "\"" + nextLine[idEmail] + "\"" + CSV_SEPARATOR;
                        if (idInstitutionName != -1)
                            result += "\"" + nextLine[idInstitutionName] + "\"" + CSV_SEPARATOR;
                        if (idWebAddress != -1)
                            result += "\"" + nextLine[idWebAddress] + "\"" + CSV_SEPARATOR;
                        result += "\"" + nextLine[idResearcherWebAddress] + "\"" + CSV_SEPARATOR;
                        result += "\"" + nextLine[idResearcherWebAddressExt] + "\"" + CSV_SEPARATOR;
                        result += "\"HOMEPAGE\"" + CSV_SEPARATOR;
                        result += "\"" + (idScoreUrl != -1 ? nextLine[idScoreUrl] : "") + "\"" + CSV_SEPARATOR;
                        if (idScoreEmail != -1)
                            result += "\"" + nextLine[idScoreEmail] + "\"" + CSV_SEPARATOR;
                        result += "\r\n";

                        try {
                            FileUtils.write(output_file, result, "UTF-8", true);
                        } catch (IOException ex) {
                            Logger.getLogger("root").error(ex.toString());
                        }
                    }

                    if (content != null) {

                        Elements links = content.select("a[href]");
                        Elements links_worepeat = new Elements();

                        for (Element link : links) {

                            boolean b = false;
                            for (Element link_worepeat : links_worepeat) {
                                if (link.absUrl("href").equals(link_worepeat.absUrl("href"))) {
                                    b = true;
                                    break;
                                }
                            }

                            if (!b)
                                links_worepeat.add(link);

                        }

                        for (Element link : links_worepeat) {

                            boolean b = false;
                            link.setBaseUri(researcher_page_url);
                            String clean_name_1 = link.text().replaceAll("[^\\w\\s]", "").toLowerCase();
                            for (String k : cv_keywords_in_name_list) {
                                if (clean_name_1.contains(k)) {
                                    b = true;
                                    break;
                                }
                            }
                            if (b) {
                                Logger.getRootLogger()
                                        .info("CV found " + link.absUrl("href") + " (" + link.text() + ")");
                                String href = link.absUrl("href");

                                String ext = "";
                                String score = "";
                                String type = "CV";

                                if (link.absUrl("href").endsWith(".pdf"))
                                    ext = "PDF";
                                else if (link.absUrl("href").endsWith(".doc"))
                                    ext = "DOC";
                                else if (link.absUrl("href").endsWith(".docx"))
                                    ext = "DOCX";
                                else if (link.absUrl("href").endsWith(".rtf"))
                                    ext = "RTF";
                                else if (link.absUrl("href").endsWith(".txt"))
                                    ext = "TXT";
                                else
                                    ext = "HTML";

                                if (ext.equals("HTML")) {
                                    score = "B";
                                } else {
                                    score = "A";
                                }

                                String result = "";
                                result += "\"" + nextLine[idStaffIdentifier] + "\"" + CSV_SEPARATOR;
                                result += "\"" + nextLine[idLastName] + "\"" + CSV_SEPARATOR;
                                result += "\"" + nextLine[idInitials] + "\"" + CSV_SEPARATOR;
                                if (idFirstName != -1)
                                    result += "\"" + nextLine[idFirstName] + "\"" + CSV_SEPARATOR;
                                if (idName != -1)
                                    result += "\"" + nextLine[idName] + "\"" + CSV_SEPARATOR;
                                if (idEmail != -1)
                                    result += "\"" + nextLine[idEmail] + "\"" + CSV_SEPARATOR;
                                if (idInstitutionName != -1)
                                    result += "\"" + nextLine[idInstitutionName] + "\"" + CSV_SEPARATOR;
                                if (idWebAddress != -1)
                                    result += "\"" + href + "\"" + CSV_SEPARATOR;
                                result += "\"" + href + "\"" + CSV_SEPARATOR;
                                result += "\"" + ext + "\"" + CSV_SEPARATOR;
                                result += "\"" + type + "\"" + CSV_SEPARATOR;
                                result += "\"" + score + "\"" + CSV_SEPARATOR;
                                if (idScoreEmail != -1)
                                    result += "\"" + nextLine[idScoreEmail] + "\"" + CSV_SEPARATOR;
                                result += "\r\n";

                                try {
                                    FileUtils.write(output_file, result, "UTF-8", true);
                                } catch (IOException ex) {
                                    Logger.getLogger("root").error(ex.toString());
                                }

                            }

                            b = false;
                            link.setBaseUri(researcher_page_url);
                            clean_name_1 = link.text().replaceAll("[^\\w\\s]", "").toLowerCase();
                            for (String k : pub_keywords_in_name_list) {
                                if (clean_name_1.contains(k)) {
                                    b = true;
                                    break;
                                }
                            }
                            if (b) {
                                Logger.getRootLogger()
                                        .info("PUB found " + link.absUrl("href") + " (" + link.text() + ")");
                                String href = link.absUrl("href");

                                String ext = "";
                                String score = "";
                                String type = "PUB";

                                if (link.absUrl("href").endsWith(".pdf"))
                                    ext = "PDF";
                                else if (link.absUrl("href").endsWith(".doc"))
                                    ext = "DOC";
                                else if (link.absUrl("href").endsWith(".docx"))
                                    ext = "DOCX";
                                else if (link.absUrl("href").endsWith(".rtf"))
                                    ext = "RTF";
                                else if (link.absUrl("href").endsWith(".txt"))
                                    ext = "TXT";
                                else
                                    ext = "HTML";

                                if (ext.equals("HTML")) {
                                    score = "-";
                                } else {
                                    score = "-";
                                }

                                String result = "";
                                result += "\"" + nextLine[idStaffIdentifier] + "\"" + CSV_SEPARATOR;
                                result += "\"" + nextLine[idLastName] + "\"" + CSV_SEPARATOR;
                                result += "\"" + nextLine[idInitials] + "\"" + CSV_SEPARATOR;
                                if (idFirstName != -1)
                                    result += "\"" + nextLine[idFirstName] + "\"" + CSV_SEPARATOR;
                                if (idName != -1)
                                    result += "\"" + nextLine[idName] + "\"" + CSV_SEPARATOR;
                                if (idEmail != -1)
                                    result += "\"" + nextLine[idEmail] + "\"" + CSV_SEPARATOR;
                                if (idInstitutionName != -1)
                                    result += "\"" + nextLine[idInstitutionName] + "\"" + CSV_SEPARATOR;
                                if (idWebAddress != -1)
                                    result += "\"" + href + "\"" + CSV_SEPARATOR;
                                result += "\"" + href + "\"" + CSV_SEPARATOR;
                                result += "\"" + ext + "\"" + CSV_SEPARATOR;
                                result += "\"" + type + "\"" + CSV_SEPARATOR;
                                result += "\"" + score + "\"" + CSV_SEPARATOR;
                                if (idScoreEmail != -1)
                                    result += "\"" + nextLine[idScoreEmail] + "\"" + CSV_SEPARATOR;
                                result += "\r\n";

                                try {
                                    FileUtils.write(output_file, result, "UTF-8", true);
                                } catch (IOException ex) {
                                    Logger.getLogger("root").error(ex.toString());
                                }

                            }
                        }

                    }
                }

                reader.close();

            }

            //                    reader = null;
            //                    try {
            //                        reader = new CSVReader(new FileReader(output_file), CSV_SEPARATOR);
            //                    } catch (FileNotFoundException ex) {
            //                        Logger.getRootLogger().error("Error reading " + input_file.getName() + " - " + ex.toString());
            //                    }
            //
            //                    reader.readNext();
            //
            //                    int newIdResearcherWebpage = 3;
            //                    if(idFirstName != -1) newIdResearcherWebpage++; 
            //                    if(idName != -1) newIdResearcherWebpage++; 
            //                    if(idEmail != -1) newIdResearcherWebpage++; 
            //                    if(idInstitutionName != -1) newIdResearcherWebpage++; 
            //                    if(idWebAddress != -1) newIdResearcherWebpage++; 
            //
            //                    List<Object[]> urls_times = new ArrayList<Object[]>();
            //                    while ((nextLine = reader.readNext()) != null) 
            //                    {
            //                        String url = nextLine[newIdResearcherWebpage];
            //
            //                        Object[] url_time = new Object[2];
            //                        url_time[0] = url;
            //                        boolean b = false;
            //                        for(Object[] u : urls_times){
            //                            if(u[0].equals(url_time[0])){
            //                                u[1] = (Integer)u[1] + 1;         
            //                                b = true;
            //                                break;
            //                            }
            //                        }
            //
            //                        if(!b){
            //                            url_time[1] = new Integer(1);
            //                            urls_times.add(url_time);
            //                        }
            //                    }            
            //
            //                    reader.close();                    

            //                try {
            //                    reader = new CSVReader(new FileReader(output_file), CSV_SEPARATOR);
            //                } catch (FileNotFoundException ex) {
            //                    Logger.getRootLogger().error("Error reading " + input_file.getName() + " - " + ex.toString());
            //                }
            //
            //                nextLine = reader.readNext();
            //                try {
            //                    for(int i = 0; i < nextLine.length; i++)
            //                        nextLine[i] = "\"" + nextLine[i] + "\"";
            //                    FileUtils.write(output_file, StringUtil.join(Arrays.asList(nextLine), ";") + "\r\n", "UTF-8", false);
            //                } catch (IOException ex) {
            //                    Logger.getLogger("root").error(ex.toString());
            //                }
            //                
            //                while ((nextLine = reader.readNext()) != null) 
            //                {
            //                    String url = nextLine[newIdResearcherWebpage];
            //                    boolean b = false;
            //                    for(Object[] u : urls_times){
            //                        if(u[0].equals(url) && ((Integer)u[1] == 1)){                                
            //                            b = true;
            //                            break;
            //                        }
            //                    }
            //                    
            //                    if(b){
            //                        try {
            //                            for(int i = 0; i < nextLine.length; i++)
            //                                nextLine[i] = "\"" + nextLine[i] + "\"";
            //                            FileUtils.write(output_file, StringUtil.join(Arrays.asList(nextLine), ";") + "\r\n", "UTF-8", true);
            //                        } catch (IOException ex) {
            //                            Logger.getLogger("root").error(ex.toString());
            //                        }
            //                    }
            //                }
            //                
            //                 reader.close();  

        } catch (Exception ex) {
            String error_msg = "Error extracting cv files from extractor " + input_file.getName();
            Logger.getRootLogger().error(error_msg + " - " + ex.toString());
            if (error_sw != null)
                error_sw.append(error_msg + "\r\n");
            return;
        }
    }
}

From source file:com.screenslicer.core.util.BrowserUtil.java

/**
 * Resolves an {@link HtmlNode} description to a {@link WebElement} by letting each
 * populated property of the node nominate matching elements of {@code body} and then
 * picking the element that collected the most votes.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>If {@code body} is {@code null} it is obtained via {@code BrowserUtil.openElement}.</li>
 *   <li>If the node has an id that matches exactly one element, that element is tried
 *       first and returned when it converts to a non-null {@code WebElement}.</li>
 *   <li>Otherwise candidate lists are gathered for tag name (or {@code a} when only an
 *       href is given), id, name, type, value, title, role, alt, CSS classes, href,
 *       inner text and the {@code multiple} attribute.</li>
 *   <li>Every occurrence of an element in a candidate list is worth two points, plus one
 *       more when {@code NodeUtil.isHidden} reports it as not hidden.</li>
 * </ol>
 *
 * @param browser  browser used to read the current URL and to convert the winning element
 * @param htmlNode description of the element being looked for
 * @param body     root element to search in; fetched from the browser when {@code null}
 * @param recurse  passed through unchanged to the element-level {@code toElement} overload
 * @return the result of converting the best-scoring element; when no element scored,
 *         the element-level overload is invoked with {@code null}
 * @throws ActionFailed propagated from the helpers this method calls
 */
private static WebElement toElement(Browser browser, HtmlNode htmlNode, Element body, boolean recurse)
        throws ActionFailed {
    if (body == null) {
        body = BrowserUtil.openElement(browser, true, null, null, null, null);
    }

    // Fast path: an id that identifies exactly one element.
    if (!CommonUtil.isEmpty(htmlNode.id)) {
        Elements byId = body.getElementsByAttributeValue("id", htmlNode.id);
        if (byId.size() == 1) {
            WebElement direct = toElement(browser, byId.get(0), htmlNode, recurse);
            if (direct != null) {
                return direct;
            }
        }
    }

    List<Elements> selected = new ArrayList<Elements>();

    // Tag name wins over the implicit "a" that an href-only description implies.
    if (!CommonUtil.isEmpty(htmlNode.tagName)) {
        selected.add(body.getElementsByTag(htmlNode.tagName));
    } else if (!CommonUtil.isEmpty(htmlNode.href)) {
        selected.add(body.getElementsByTag("a"));
    }

    // Plain attribute-equals criteria, evaluated in this fixed order.
    String[][] attributeCriteria = {
        { "id", htmlNode.id },
        { "name", htmlNode.name },
        { "type", htmlNode.type },
        { "value", htmlNode.value },
        { "title", htmlNode.title },
        { "role", htmlNode.role },
        { "alt", htmlNode.alt },
    };
    for (String[] criterion : attributeCriteria) {
        if (!CommonUtil.isEmpty(criterion[1])) {
            selected.add(body.getElementsByAttributeValue(criterion[0], criterion[1]));
        }
    }

    // CSS classes: keep only the elements that carry the largest number of requested classes.
    String[] classNames = htmlNode.classes;
    if (classNames != null && classNames.length > 0) {
        Map<Element, Integer> hitsPerElement = new HashMap<Element, Integer>();
        for (String className : classNames) {
            for (Element candidate : body.getElementsByClass(className)) {
                Integer hits = hitsPerElement.get(candidate);
                hitsPerElement.put(candidate, hits == null ? 1 : hits.intValue() + 1);
            }
        }
        Elements bestClassMatches = new Elements();
        for (int wanted = classNames.length; wanted > 0 && bestClassMatches.isEmpty(); wanted--) {
            for (Map.Entry<Element, Integer> tally : hitsPerElement.entrySet()) {
                if (tally.getValue().intValue() == wanted) {
                    bestClassMatches.add(tally.getKey());
                }
            }
        }
        selected.add(bestClassMatches);
    }

    // Href: the closer the match, the more often the element is listed (and thus counted).
    if (!CommonUtil.isEmpty(htmlNode.href)) {
        Elements linked = body.getElementsByAttribute("href");
        Elements toAdd = new Elements();
        String currentUrl = browser.getCurrentUrl();
        String hrefGiven = htmlNode.href;
        for (Element link : linked) {
            String hrefFound = link.attr("href");
            int weight;
            if (hrefGiven.equalsIgnoreCase(hrefFound)) {
                weight = 3;
            } else if (htmlNode.fuzzy && hrefFound != null && hrefFound.endsWith(hrefGiven)) {
                weight = 2;
            } else if (htmlNode.fuzzy && hrefFound != null && hrefFound.contains(hrefGiven)) {
                weight = 1;
            } else {
                String uriGiven = UrlUtil.toCanonicalUri(currentUrl, hrefGiven);
                String uriFound = UrlUtil.toCanonicalUri(currentUrl, hrefFound);
                weight = uriGiven.equalsIgnoreCase(uriFound) ? 1 : 0;
            }
            for (int n = 0; n < weight; n++) {
                toAdd.add(link);
            }
        }
        selected.add(toAdd);
    }

    // Inner text: one list for "contains", a second one for "is exactly (modulo whitespace)".
    if (!CommonUtil.isEmpty(htmlNode.innerText)) {
        String quotedText = Pattern.quote(htmlNode.innerText);
        selected.add(body.getElementsMatchingText(quotedText));
        selected.add(body.getElementsMatchingText("^\\s*" + quotedText + "\\s*$"));
    }

    if (htmlNode.multiple != null) {
        selected.add(body.getElementsByAttribute("multiple"));
    }

    // Tally: 2 points per occurrence, 1 bonus point when the element is not hidden.
    Map<Element, Integer> votes = new HashMap<Element, Integer>();
    for (Elements group : selected) {
        for (Element candidate : group) {
            Integer soFar = votes.get(candidate);
            int score = (soFar == null ? 0 : soFar.intValue()) + 2;
            if (!NodeUtil.isHidden(candidate)) {
                score++;
            }
            votes.put(candidate, score);
        }
    }

    // Strictly-greater comparison: the first entry reaching the top score is kept.
    Element winner = null;
    int topScore = 0;
    for (Map.Entry<Element, Integer> ballot : votes.entrySet()) {
        int score = ballot.getValue();
        if (score > topScore) {
            topScore = score;
            winner = ballot.getKey();
        }
    }
    return toElement(browser, winner, htmlNode, recurse);
}

From source file:com.screenslicer.core.util.Util.java

/**
 * Resolves an {@link HtmlNode} description to a {@link WebElement} by collecting
 * candidate elements for each populated property of the node and returning the
 * non-hidden element that appears in the most candidate lists.
 *
 * <p>An element found through {@code body.getElementById(htmlNode.id)} is tried first
 * and returned when it converts to a non-null {@code WebElement}. Otherwise candidates
 * are gathered for tag name (or {@code a} when only an href is given), name, type,
 * value, title, CSS classes, href, inner text and the {@code multiple} attribute.
 * Elements for which {@code Util.isHidden} returns {@code true} receive no votes.
 *
 * @param driver   driver used to read the current URL and to convert the winning element
 * @param htmlNode description of the element being looked for
 * @param body     root element to search in; fetched via {@code Util.openElement} when
 *                 {@code null}
 * @return the result of converting the best-scoring element; when no element scored,
 *         the element-level overload is invoked with {@code null}
 * @throws ActionFailed propagated from the helpers this method calls
 */
public static WebElement toElement(RemoteWebDriver driver, HtmlNode htmlNode, Element body)
        throws ActionFailed {
    if (body == null) {
        body = Util.openElement(driver, null, null, null);
    }

    // Fast path: look the element up by id before doing any scoring.
    if (!CommonUtil.isEmpty(htmlNode.id)) {
        WebElement direct = toElement(driver, body.getElementById(htmlNode.id));
        if (direct != null) {
            return direct;
        }
    }

    List<Elements> selected = new ArrayList<Elements>();

    // Tag name wins over the implicit "a" that an href-only description implies.
    if (!CommonUtil.isEmpty(htmlNode.tagName)) {
        selected.add(body.getElementsByTag(htmlNode.tagName));
    } else if (!CommonUtil.isEmpty(htmlNode.href)) {
        selected.add(body.getElementsByTag("a"));
    }

    // Plain attribute-equals criteria, evaluated in this fixed order.
    String[][] attributeCriteria = {
        { "name", htmlNode.name },
        { "type", htmlNode.type },
        { "value", htmlNode.value },
        { "title", htmlNode.title },
    };
    for (String[] criterion : attributeCriteria) {
        if (!CommonUtil.isEmpty(criterion[1])) {
            selected.add(body.getElementsByAttributeValue(criterion[0], criterion[1]));
        }
    }

    // CSS classes: keep only the elements that carry the largest number of requested classes.
    String[] classNames = htmlNode.classes;
    if (classNames != null && classNames.length > 0) {
        Map<Element, Integer> hitsPerElement = new HashMap<Element, Integer>();
        for (String className : classNames) {
            for (Element candidate : body.getElementsByClass(className)) {
                Integer hits = hitsPerElement.get(candidate);
                hitsPerElement.put(candidate, hits == null ? 1 : hits.intValue() + 1);
            }
        }
        Elements bestClassMatches = new Elements();
        for (int wanted = classNames.length; wanted > 0 && bestClassMatches.isEmpty(); wanted--) {
            for (Map.Entry<Element, Integer> tally : hitsPerElement.entrySet()) {
                if (tally.getValue().intValue() == wanted) {
                    bestClassMatches.add(tally.getKey());
                }
            }
        }
        selected.add(bestClassMatches);
    }

    // Href: literal (case-insensitive) match, or equal after canonicalisation against the page URL.
    if (!CommonUtil.isEmpty(htmlNode.href)) {
        Elements linked = body.getElementsByAttribute("href");
        Elements toAdd = new Elements();
        String currentUrl = driver.getCurrentUrl();
        String hrefGiven = htmlNode.href;
        for (Element link : linked) {
            String hrefFound = link.attr("href");
            boolean matches;
            if (hrefGiven.equalsIgnoreCase(hrefFound)) {
                matches = true;
            } else {
                String uriGiven = Util.toCanonicalUri(currentUrl, hrefGiven);
                String uriFound = Util.toCanonicalUri(currentUrl, hrefFound);
                matches = uriGiven.equalsIgnoreCase(uriFound);
            }
            if (matches) {
                toAdd.add(link);
            }
        }
        selected.add(toAdd);
    }

    if (!CommonUtil.isEmpty(htmlNode.innerText)) {
        selected.add(body.getElementsMatchingText(Pattern.quote(htmlNode.innerText)));
    }

    if (htmlNode.multiple != null) {
        selected.add(body.getElementsByAttribute("multiple"));
    }

    // Tally: one point per occurrence; hidden elements never enter the map.
    Map<Element, Integer> votes = new HashMap<Element, Integer>();
    for (Elements group : selected) {
        for (Element candidate : group) {
            if (Util.isHidden(candidate)) {
                continue;
            }
            Integer soFar = votes.get(candidate);
            votes.put(candidate, soFar == null ? 1 : soFar.intValue() + 1);
        }
    }

    // Strictly-greater comparison: the first entry reaching the top score is kept.
    Element winner = null;
    int topScore = 0;
    for (Map.Entry<Element, Integer> ballot : votes.entrySet()) {
        int score = ballot.getValue();
        if (score > topScore) {
            topScore = score;
            winner = ballot.getKey();
        }
    }
    return toElement(driver, winner);
}

From source file:org.asqatasun.processor.DOMHandlerImpl.java

/**
 * This method should be called at each first selection of a RuleImplementation
 * to reset all the local collections./*from  w  w  w  .  j  ava2  s. c o m*/
 * 
 * @return the current instance 
 */
@Override
public DOMHandler beginCssLikeSelection() {
    // reset the local collection of elements
    selectedElements = new Elements();
    return this;
}

From source file:org.asqatasun.processor.DOMHandlerImpl.java

/**
 * Returns the total number of elements, computing it on first use.
 *
 * <p>While the cached value is still {@code -1}, the count is obtained by selecting
 * {@code "*"} and reading the number of selected elements; the selection collection
 * is then replaced with a fresh empty one.
 *
 * @return the cached total number of elements
 */
@Override
public int getTotalNumberOfElements() {
    // already computed: hand back the cached value
    if (totalNumberOfElement != -1) {
        return totalNumberOfElement;
    }
    totalNumberOfElement = cssLikeSelectNodeSet("*").getSelectedElementNumber();
    // the counting selection must not leak into later selections
    selectedElements = new Elements();
    return totalNumberOfElement;
}

From source file:org.asqatasun.rules.elementchecker.text.TextEmptinessCheckerTest.java

/**
 * Prepares the fixtures used by the tests: an empty {@code Elements} collection,
 * a single {@code div} element, and mocks for the collaborating services.
 *
 * @throws Exception if the parent set-up fails
 */
@Override
protected void setUp() throws Exception {
    super.setUp();

    // jsoup fixtures
    element = new Element(Tag.valueOf("div"), "");
    elements = new Elements();

    // mocked collaborators
    mockProcessRemarkService = createMock(ProcessRemarkService.class);
    mockTestSolutionHandler = createMock(TestSolutionHandler.class);
    mockSSPHandler = createMock(SSPHandler.class);
    mockTextElementBuilder = createMock(TextElementBuilder.class);
}

From source file:org.asqatasun.rules.elementchecker.text.TextNotIdenticalToAttributeCheckerTest.java

/**
 * Prepares the fixtures used by the tests: an empty {@code Elements} collection,
 * a single {@code div} element whose {@code AttributeStore.ALT_ATTR} attribute is
 * set to {@code "test"}, and mocks for the collaborating services.
 *
 * @throws Exception if the parent set-up fails
 */
@Override
protected void setUp() throws Exception {
    super.setUp();

    // jsoup fixtures
    element = new Element(Tag.valueOf("div"), "");
    element.attr(AttributeStore.ALT_ATTR, "test");
    elements = new Elements();

    // mocked collaborators
    mockProcessRemarkService = createMock(ProcessRemarkService.class);
    mockTestSolutionHandler = createMock(TestSolutionHandler.class);
    mockSSPHandler = createMock(SSPHandler.class);
    mockTextElementBuilder = createMock(TextElementBuilder.class);
}