Example usage for org.jsoup.nodes Element absUrl

List of usage examples for org.jsoup.nodes Element absUrl

Introduction

On this page you can find example usages of org.jsoup.nodes.Element.absUrl.

Prototype

public String absUrl(String attributeKey) 

Source Link

Document

Get an absolute URL from a URL attribute that may be relative (i.e. an <a href> or <img src>).

Usage

From source file:org.apache.marmotta.ldclient.provider.phpbb.mapping.PHPBBTopicHrefMapper.java

/**
 * Maps the link held by the selected element to the URI of the phpBB topic it points to.
 *
 * <p>The topic URI is built as {@code <site>/viewtopic.php?t=<id>}, where {@code <site>} is
 * {@code resourceUri} up to (not including) its last {@code '/'} and {@code <id>} is the value
 * of the {@code t} query parameter of the element's absolute {@code href}.
 *
 * @param resourceUri   URI of the resource being mapped; must contain at least one {@code '/'}
 * @param selectedValue element whose {@code href} attribute links to the topic
 * @param factory       factory used to create the resulting value
 * @return a single-element list with the topic URI, or an empty list when the link carries no
 *         {@code t} parameter (for example because the {@code href} could not be resolved)
 * @throws RuntimeException if the absolute {@code href} is not a syntactically valid URI
 */
@Override
public List<Value> map(String resourceUri, Element selectedValue, ValueFactory factory) {
    String baseUriSite = resourceUri.substring(0, resourceUri.lastIndexOf('/'));
    String baseUriTopic = baseUriSite + "/viewtopic.php?";

    try {
        URI uri = new URI(selectedValue.absUrl("href"));

        // Keep the last "t" parameter, which is what collecting all parameters into a map did.
        String topicId = null;
        for (NameValuePair p : URLEncodedUtils.parse(uri, "UTF-8")) {
            if ("t".equals(p.getName())) {
                topicId = p.getValue();
            }
        }

        if (topicId == null) {
            // Without a topic id the only thing we could emit is a bogus "...?t=null" URI.
            return Collections.<Value>emptyList();
        }

        return Collections.singletonList((Value) factory.createURI(baseUriTopic + "t=" + topicId));
    } catch (URISyntaxException ex) {
        throw new RuntimeException("invalid syntax for URI", ex);
    }
}

From source file:eu.sisob.uma.extractors.adhoc.cvfilesinside.InternalCVFilesExtractor.java

/**
 * Builds a CSV that lists, for every researcher row of {@code input_file}, the researcher page
 * itself (type {@code HOMEPAGE}) followed by each distinct link on that page whose text contains
 * one of the CV keywords (type {@code CV}) or one of the publication keywords (type {@code PUB}).
 *
 * <p>Nothing is written unless the input header contains the id, last name, initials, researcher
 * page URL, researcher page type and researcher page extension columns. Failures are logged and,
 * when {@code error_sw} is not {@code null}, also appended to it.
 *
 * @param input_file  CSV file to read; its first row is treated as the header
 * @param data_dir    not used by this method
 * @param output_file CSV file receiving the result; its previous content is replaced
 * @param error_sw    collector for human-readable error messages, may be {@code null}
 */
public static void extract_cv_files(File input_file, File data_dir,
        File output_file/*, File output_file_2, File results_dir,*/, StringWriter error_sw) {
    CSVReader reader;
    try {
        reader = new CSVReader(new FileReader(input_file), CSV_SEPARATOR);
    } catch (FileNotFoundException ex) {
        Logger.getRootLogger().error("Error reading " + input_file.getName() + " - " + ex.toString());
        // Same message callers received before, when this case surfaced as a header-read failure.
        appendExtractionError(error_sw, "Error reading headers of " + input_file.getName() + "\r\n");
        return;
    }

    try {
        String[] header;
        try {
            header = reader.readNext();
        } catch (Exception ex) {
            String error_msg = "Error reading headers of " + input_file.getName();
            Logger.getRootLogger().error(error_msg + " - " + ex.toString());
            appendExtractionError(error_sw, error_msg + "\r\n");
            return;
        }

        // Locate the columns; -1 means the column is absent from the input.
        // NOTE(review): assumes the CSV_COL_* constants are distinct strings - confirm.
        int idStaffIdentifier = findCsvColumn(header, FileFormatConversor.CSV_COL_ID);
        int idName = findCsvColumn(header, FileFormatConversor.CSV_COL_NAME);
        int idFirstName = findCsvColumn(header, FileFormatConversor.CSV_COL_FIRSTNAME);
        int idLastName = findCsvColumn(header, FileFormatConversor.CSV_COL_LASTNAME);
        int idInitials = findCsvColumn(header, FileFormatConversor.CSV_COL_INITIALS);
        int idInstitutionName = findCsvColumn(header, FileFormatConversor.CSV_COL_INSTITUTION_NAME);
        int idWebAddress = findCsvColumn(header, FileFormatConversor.CSV_COL_INSTITUTION_URL);
        int idResearcherWebAddress = findCsvColumn(header, FileFormatConversor.CSV_COL_RESEARCHER_PAGE_URL);
        int idResearcherWebAddressType = findCsvColumn(header,
                FileFormatConversor.CSV_COL_RESEARCHER_PAGE_TYPE);
        int idResearcherWebAddressExt = findCsvColumn(header,
                FileFormatConversor.CSV_COL_RESEARCHER_PAGE_EXT);
        int idScoreUrl = findCsvColumn(header, FileFormatConversor.CSV_COL_SCORE_URL);
        int idEmail = findCsvColumn(header, FileFormatConversor.CSV_COL_EMAIL);
        int idScoreEmail = findCsvColumn(header, FileFormatConversor.CSV_COL_SCORE_EMAIL);

        if (idResearcherWebAddress == -1 || idResearcherWebAddressType == -1
                || idResearcherWebAddressExt == -1 || idStaffIdentifier == -1 || idLastName == -1
                || idInitials == -1) {
            // A mandatory column is missing: nothing to do (silently, as before).
            return;
        }

        boolean has_institution_url = idWebAddress != -1;

        // Write the output header; a failure here is reported but does not stop the extraction.
        try {
            StringBuilder header_line = new StringBuilder();
            header_line.append(quoteCsvField(FileFormatConversor.CSV_COL_ID));
            header_line.append(quoteCsvField(FileFormatConversor.CSV_COL_LASTNAME));
            header_line.append(quoteCsvField(FileFormatConversor.CSV_COL_INITIALS));
            if (idFirstName != -1) {
                header_line.append(quoteCsvField(FileFormatConversor.CSV_COL_FIRSTNAME));
            }
            if (idName != -1) {
                header_line.append(quoteCsvField(FileFormatConversor.CSV_COL_NAME));
            }
            if (idEmail != -1) {
                header_line.append(quoteCsvField(FileFormatConversor.CSV_COL_EMAIL));
            }
            if (idInstitutionName != -1) {
                header_line.append(quoteCsvField(FileFormatConversor.CSV_COL_INSTITUTION_NAME));
            }
            if (has_institution_url) {
                header_line.append(quoteCsvField(FileFormatConversor.CSV_COL_INSTITUTION_URL));
            }
            header_line.append(quoteCsvField(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_URL));
            header_line.append(quoteCsvField(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_EXT));
            header_line.append(quoteCsvField(FileFormatConversor.CSV_COL_RESEARCHER_PAGE_TYPE));
            header_line.append(quoteCsvField(FileFormatConversor.CSV_COL_SCORE_URL));
            if (idScoreEmail != -1) {
                header_line.append(quoteCsvField(FileFormatConversor.CSV_COL_SCORE_EMAIL));
            }
            header_line.append("\r\n");
            FileUtils.write(output_file, header_line.toString(), "UTF-8", false);
        } catch (IOException ex) {
            Logger.getLogger("root").error(ex.toString());
            appendExtractionError(error_sw, "Error creating output files\r\n");
        }

        try {
            // Values made only of these characters are skipped without a download attempt
            // (presumably bare file names rather than URLs - confirm). Matches exactly the same
            // strings as the former "([a-zA-Z0-9#._-]+)+" but cannot backtrack exponentially.
            Pattern no_download_pattern = Pattern.compile("[a-zA-Z0-9#._-]+");

            String[] nextLine;
            while ((nextLine = reader.readNext()) != null) {
                nextLine[idLastName] = normalizePersonName(nextLine[idLastName]);
                nextLine[idInitials] = normalizePersonName(nextLine[idInitials]);
                if (idFirstName != -1) {
                    nextLine[idFirstName] = normalizePersonName(nextLine[idFirstName]);
                }
                if (idName != -1) {
                    nextLine[idName] = normalizePersonName(nextLine[idName]);
                }

                String researcher_page_url = nextLine[idResearcherWebAddress];
                Document content = null;
                if (!no_download_pattern.matcher(researcher_page_url).matches()) {
                    content = downloadResearcherPage(researcher_page_url, error_sw);
                }

                // Columns shared by the HOMEPAGE row and every CV/PUB row of this researcher.
                StringBuilder prefix = new StringBuilder();
                prefix.append(quoteCsvField(nextLine[idStaffIdentifier]));
                prefix.append(quoteCsvField(nextLine[idLastName]));
                prefix.append(quoteCsvField(nextLine[idInitials]));
                if (idFirstName != -1) {
                    prefix.append(quoteCsvField(nextLine[idFirstName]));
                }
                if (idName != -1) {
                    prefix.append(quoteCsvField(nextLine[idName]));
                }
                if (idEmail != -1) {
                    prefix.append(quoteCsvField(nextLine[idEmail]));
                }
                if (idInstitutionName != -1) {
                    prefix.append(quoteCsvField(nextLine[idInstitutionName]));
                }
                String row_prefix = prefix.toString();
                String row_suffix = (idScoreEmail != -1 ? quoteCsvField(nextLine[idScoreEmail]) : "")
                        + "\r\n";

                // The researcher page itself is always listed, even when it could not be loaded.
                String homepage_row = row_prefix
                        + (has_institution_url ? quoteCsvField(nextLine[idWebAddress]) : "")
                        + quoteCsvField(nextLine[idResearcherWebAddress])
                        + quoteCsvField(nextLine[idResearcherWebAddressExt])
                        + quoteCsvField("HOMEPAGE")
                        + quoteCsvField(idScoreUrl != -1 ? nextLine[idScoreUrl] : "")
                        + row_suffix;
                appendCsvLine(output_file, homepage_row);

                if (content == null) {
                    continue;
                }

                // Keep the first link for each distinct absolute target, in document order.
                Elements unique_links = new Elements();
                java.util.Set<String> seen_targets = new java.util.HashSet<String>();
                for (Element link : content.select("a[href]")) {
                    if (seen_targets.add(link.absUrl("href"))) {
                        unique_links.add(link);
                    }
                }

                for (Element link : unique_links) {
                    link.setBaseUri(researcher_page_url);
                    String clean_link_text = link.text().replaceAll("[^\\w\\s]", "").toLowerCase();

                    boolean is_cv = false;
                    for (String k : cv_keywords_in_name_list) {
                        if (clean_link_text.contains(k)) {
                            is_cv = true;
                            break;
                        }
                    }
                    if (is_cv) {
                        writeLinkedFileRow(output_file, row_prefix, has_institution_url, row_suffix, link,
                                "CV");
                    }

                    boolean is_pub = false;
                    for (String k : pub_keywords_in_name_list) {
                        if (clean_link_text.contains(k)) {
                            is_pub = true;
                            break;
                        }
                    }
                    if (is_pub) {
                        writeLinkedFileRow(output_file, row_prefix, has_institution_url, row_suffix, link,
                                "PUB");
                    }
                }
            }
        } catch (Exception ex) {
            // Boundary catch: any failure while processing rows aborts the remaining rows.
            String error_msg = "Error extracting cv files from extractor " + input_file.getName();
            Logger.getRootLogger().error(error_msg + " - " + ex.toString());
            appendExtractionError(error_sw, error_msg + "\r\n");
        }
    } finally {
        // Release the input file on every path, not only after a fully successful run.
        try {
            reader.close();
        } catch (IOException ex) {
            Logger.getRootLogger().error("Error closing " + input_file.getName() + " - " + ex.toString());
        }
    }
}

/** Appends {@code message} to {@code error_sw} unless no collector was supplied. */
private static void appendExtractionError(StringWriter error_sw, String message) {
    if (error_sw != null) {
        error_sw.append(message);
    }
}

/** Returns the index of the last header cell equal to {@code column_name}, or -1 if there is none. */
private static int findCsvColumn(String[] header, String column_name) {
    if (header == null) {
        return -1;
    }
    for (int i = header.length - 1; i >= 0; i--) {
        if (header[i] != null && header[i].equals(column_name)) {
            return i;
        }
    }
    return -1;
}

/** Wraps {@code value} in double quotes and appends the column separator (no escaping is done). */
private static String quoteCsvField(String value) {
    return "\"" + value + "\"" + CSV_SEPARATOR;
}

/** Replaces every non-ASCII-letter character with a space and lower-cases the result. */
private static String normalizePersonName(String raw) {
    return raw.replaceAll("[^a-zA-Z]", " ").toLowerCase();
}

/** Appends one already formatted line to the output file, logging (not propagating) I/O errors. */
private static void appendCsvLine(File output_file, String line) {
    try {
        FileUtils.write(output_file, line, "UTF-8", true);
    } catch (IOException ex) {
        Logger.getLogger("root").error(ex.toString());
    }
}

/**
 * Downloads the researcher page through a temporary file and parses it as HTML.
 *
 * @return the parsed document with its base URI set to the page URL, or {@code null} when the
 *         download failed, the file exceeds 100 MB, or its first 100 characters do not contain
 *         "html"
 */
private static Document downloadResearcherPage(String researcher_page_url, StringWriter error_sw) {
    File temp_file = null;
    try {
        Logger.getRootLogger().info("Reading " + researcher_page_url);

        temp_file = File.createTempFile("internal-cv-files-", ".tmp");
        URL fetched_url = Downloader.fetchURL(researcher_page_url);
        FileUtils.copyURLToFile(fetched_url, temp_file);

        long sizeInMb = temp_file.length() / (1024 * 1024);
        if (sizeInMb > 100) {
            // Too large to load into memory; skipped without reporting an error.
            return null;
        }

        String text_content = FileUtils.readFileToString(temp_file);
        String check_string = text_content.length() <= 100 ? text_content : text_content.substring(0, 100);
        if (!check_string.toLowerCase().contains("html")) {
            throw new IOException(researcher_page_url + " is not html document");
        }

        Document content = Jsoup.parse(text_content);
        content.setBaseUri(researcher_page_url);
        return content;
    } catch (Exception ex) {
        Logger.getLogger("root").error("" + researcher_page_url + " could not loaded", ex);
        appendExtractionError(error_sw, "" + researcher_page_url + " could not loaded");
        return null;
    } catch (java.lang.OutOfMemoryError ex2) {
        // Deliberate best effort: one oversized page must not stop the remaining rows.
        Logger.getLogger("root").error("" + researcher_page_url + " could not loaded (out of memory)", ex2);
        appendExtractionError(error_sw, "" + researcher_page_url + " could not loaded (out of memory)");
        return null;
    } finally {
        if (temp_file != null) {
            temp_file.delete();
        }
    }
}

/** Maps the (case-sensitive) suffix of {@code href} to an extension label, defaulting to HTML. */
private static String linkedFileExtension(String href) {
    if (href.endsWith(".pdf")) {
        return "PDF";
    }
    if (href.endsWith(".doc")) {
        return "DOC";
    }
    if (href.endsWith(".docx")) {
        return "DOCX";
    }
    if (href.endsWith(".rtf")) {
        return "RTF";
    }
    if (href.endsWith(".txt")) {
        return "TXT";
    }
    return "HTML";
}

/**
 * Logs a keyword hit and appends the matching CSV row for {@code link}.
 *
 * @param type {@code "CV"} (scored "A" for document files, "B" for HTML) or {@code "PUB"}
 *             (always scored "-")
 */
private static void writeLinkedFileRow(File output_file, String row_prefix, boolean has_institution_url,
        String row_suffix, Element link, String type) {
    String href = link.absUrl("href");
    Logger.getRootLogger().info(type + " found " + href + " (" + link.text() + ")");

    String ext = linkedFileExtension(href);
    String score;
    if ("CV".equals(type)) {
        score = "HTML".equals(ext) ? "B" : "A";
    } else {
        score = "-";
    }

    // NOTE(review): the institution URL column is filled with the link target here, unlike the
    // HOMEPAGE row which keeps the institution URL from the input - kept as is, confirm intent.
    String row = row_prefix
            + (has_institution_url ? quoteCsvField(href) : "")
            + quoteCsvField(href)
            + quoteCsvField(ext)
            + quoteCsvField(type)
            + quoteCsvField(score)
            + row_suffix;
    appendCsvLine(output_file, row);
}

From source file:org.sbs.goodcrawler.fetcher.FetchWorker.java

/**
 * @param url/*from   w ww .j  a va 2  s  .c o m*/
 * @desc 
 */
public void fetchPage(WebURL url) {
    PageFetchResult result = null;
    try {
        if (null != url && StringUtils.isNotBlank(url.getURL())) {
            // ??
            if (fetchFilter(url.getURL())) {
                result = fetcher.fetchHeader(url);
                // ??
                int statusCode = result.getStatusCode();
                if (statusCode == CustomFetchStatus.PageTooBig) {
                    onIgnored(url);
                    return;
                }
                if (statusCode != HttpStatus.SC_OK) {
                    onFailed(url);
                } else {
                    Page page = new Page(url);
                    pendingUrls.processedSuccess();
                    if (!result.fetchContent(page)) {
                        onFailed(url);
                        return;
                    }
                    if (!parser.parse(page, url.getURL())) {
                        onFailed(url);
                        return;
                    }
                    // ??
                    if (extractFilter(url.getURL())) {
                        pendingPages.addElement(page);
                    }

                    // depth
                    if (url.getDepth() > conf.getMaxDepthOfCrawling() && conf.getMaxDepthOfCrawling() != -1) {
                        return;
                    }
                    // ???Url?Url
                    Document doc = Jsoup.parse(new String(page.getContentData(), page.getContentCharset()),
                            urlUtils.getBaseUrl(page.getWebURL().getURL()));
                    Elements links = doc.getElementsByTag("a");
                    if (!links.isEmpty()) {
                        for (Element link : links) {
                            String linkHref = link.absUrl("href");
                            // ??
                            if (fetchFilter(linkHref) && !bloomfilterHelper.exist(linkHref)) {
                                WebURL purl = new WebURL();
                                purl.setURL(linkHref);
                                purl.setJobName(conf.jobName);
                                purl.setDepth((short) (url.getDepth() + 1));
                                if (purl.getDepth() > conf.getMaxDepthOfCrawling()
                                        && conf.getMaxDepthOfCrawling() != -1)
                                    return;
                                try {
                                    if (!pendingUrls.addElement(purl, 1000)) {
                                        FileUtils.writeStringToFile(new File("status/_urls.good"),
                                                url.getURL() + "\n", true);
                                    }
                                } catch (QueueException e) {
                                    log.error(e.getMessage());
                                }
                            }
                        }
                    }
                }
            } else {
                onIgnored(url);
            }
        }
    } catch (Exception e) {
        onFailed(url);
    } catch (QueueException e) {
        onFailed(url);
    } finally {
        if (null != result)
            result.discardContentIfNotConsumed();
    }
}

From source file:org.aliuge.crawler.fetcher.FetchWorker.java

/**
 * @param url/*from   ww  w . j av a 2s  . c o  m*/
 * @desc 
 */
public void fetchPage(WebURL url) {

    PageFetchResult result = null;
    try {
        if (null != url && StringUtils.isNotBlank(url.getUrl())) {

            result = fetcher.fetch(url, true);
            // ??
            int statusCode = result.getStatusCode();
            if (statusCode == CustomFetchStatus.PageTooBig) {
                onIgnored(url);
                return;
            }
            if (statusCode != HttpStatus.SC_OK) {
                onFailed(url);
            } else {
                Page page = new Page(url);
                onSuccessed();
                if (!result.fetchContent(page)) {
                    onFailed(url);
                    return;
                }
                if (!parser.parse(page, url.getUrl())) {
                    onFailed(url);
                    return;
                }
                // ??
                String e_url = extractFilterAndChangeUrl(url.getUrl());
                if (StringUtils.isNoneBlank(e_url)) {
                    url.setUrl(e_url);
                    page.setWebURL(url);
                    pendingPages.addElement(page);
                    return;
                }

                // depth
                if (url.getDepth() > config.getMaxDepthOfCrawling() && config.getMaxDepthOfCrawling() != -1) {
                    return;
                }
                // ???Url?Url
                Document doc = Jsoup.parse(new String(page.getContentData(), page.getContentCharset()),
                        urlUtils.getBaseUrl(page.getWebURL().getUrl()));
                Elements links = doc.getElementsByTag("a");
                if (!links.isEmpty()) {
                    for (Element link : links) {
                        String linkHref = link.absUrl("href");

                        // ???url
                        if ((fetchFilter(linkHref) || extractFilter(linkHref))
                                && !bloomfilterHelper.exist(linkHref)) {
                            WebURL purl = new WebURL();
                            purl.setName(link.text());
                            purl.setUrl(linkHref);

                            purl.setDepth((short) (url.getDepth() + 1));
                            if (purl.getDepth() > config.getMaxDepthOfCrawling()
                                    && config.getMaxDepthOfCrawling() != -1)
                                return;
                            try {
                                if (!pendingUrls.addElement(purl)) {
                                    FileUtils.writeStringToFile(new File("status/_urls.good"),
                                            url.getUrl() + "\n", true);
                                }
                            } catch (QueueException e) {
                                log.error(e.getMessage());
                            }
                        }
                    }
                }
            }

        }
    } catch (QueueException e) {
        onFailed(url);
    } catch (Exception e) {
        e.printStackTrace();
        onFailed(url);
    } finally {
        if (null != result)
            result.discardContentIfNotConsumed();
    }
}

From source file:me.vertretungsplan.parser.UntisMonitorParser.java

/**
 * Loads the page at {@code url} and collects the substitution-schedule documents found there
 * into {@code docs}, following frames and (optionally) meta-refresh redirects.
 * <p>
 * If the page has no {@code .mon_title} element, matching {@code subst_NNN.htm(l)} frames are
 * loaded recursively instead. If the page has one, it is handed to {@code findSubDocs} and, when
 * {@code following} is set, its {@code meta[http-equiv=refresh]} target is loaded next unless
 * that target equals {@code startUrl}. Recursion stops at {@code MAX_RECURSION_DEPTH}.
 *
 * @param url            the URL to load, or {@code VALUE_URL_LOGIN_RESPONSE} to use the stored
 *                       login response instead of issuing a request
 * @param encoding       the encoding passed to {@code httpGet}
 * @param following      whether meta-refresh redirects should be followed
 * @param docs           the list that receives the documents found
 * @param startUrl       the URL at which following redirects must stop
 * @param recursionDepth the current recursion depth
 * @throws IOException                if the first page cannot be loaded or contains no schedule
 * @throws CredentialInvalidException if the page text contains "registriert"
 */
private void loadUrl(String url, String encoding, boolean following, List<Document> docs, String startUrl,
        int recursionDepth) throws IOException, CredentialInvalidException {
    String html;
    if (url.equals(VALUE_URL_LOGIN_RESPONSE)) {
        html = loginResponse;
    } else {
        try {
            html = httpGet(url, encoding).replace("&nbsp;", "");
        } catch (HttpResponseException e) {
            if (docs.size() == 0) {
                throw e;
            } else {
                return; // ignore if first page was loaded and redirect didn't work
            }
        }
    }
    Document doc = Jsoup.parse(html);
    doc.setBaseUri(url);

    if (doc.select(".mon_title").size() == 0) {
        // We have a problem - there seems to be no substitution schedule. Maybe it is hiding
        // inside a frame?
        if (doc.select("frameset frame[name]").size() > 0) {
            for (Element frame : doc.select("frameset frame")) {
                if (frame.attr("src").matches(".*subst_\\d\\d\\d.html?")
                        && recursionDepth < MAX_RECURSION_DEPTH) {
                    String frameUrl = frame.absUrl("src");
                    loadUrl(frameUrl, encoding, following, docs, frameUrl, recursionDepth + 1);
                }
            }
        } else if (doc.text().contains("registriert")) {
            throw new CredentialInvalidException();
        } else {
            if (docs.size() == 0) {
                // Nothing has been loaded so far, so this is a real failure. If an earlier page
                // was already loaded, a follow-up page without a schedule is silently ignored.
                throw new IOException(
                        "Could not find .mon-title, seems like there is no Untis " + "schedule here");
            }
        }
    } else {
        findSubDocs(docs, html, doc);

        if (following) {
            Element meta = doc.select("meta[http-equiv=refresh]").first();
            if (meta != null) {
                // Locale.ROOT keeps the lower-casing independent of the device locale.
                String attr = meta.attr("content").toLowerCase(java.util.Locale.ROOT);
                int urlIndex = attr.indexOf("url=");
                // A refresh without a target (e.g. content="30") just reloads the same page;
                // there is nothing to follow in that case.
                if (urlIndex != -1) {
                    String redirectUrl = url.substring(0, url.lastIndexOf("/") + 1)
                            + attr.substring(urlIndex + 4);
                    if (!redirectUrl.equals(startUrl) && recursionDepth < MAX_RECURSION_DEPTH) {
                        loadUrl(redirectUrl, encoding, true, docs, startUrl, recursionDepth + 1);
                    }
                }
            }
        }
    }
}

From source file:ac.simons.oembed.Oembed.java

/**
 * Parses  the given html document into a document and processes 
 * all anchor elements. If a valid anchor is found, it tries to
 * get an oembed response for it's url and than render the result
 * into the document replacing the given anchor.<br>
 * It returns the html representation of the new document.<br>
 * If there's an error or no oembed result for an url, the anchor tag
 * will be left as it was. /*from  w  ww  .j  av  a  2  s  . co  m*/
 * @param document The document that should be checked for links to transform
 * @return the transformed document
 */
public Document transformDocument(final Document document) {
    boolean changedBaseUri = false;
    if (document.baseUri() == null && this.getBaseUri() != null) {
        document.setBaseUri(this.getBaseUri());
        changedBaseUri = true;
    }
    for (Element a : document.getElementsByTag("a")) {
        final String href = a.absUrl("href");
        try {
            String renderedRespose = null;
            final OembedResponse oembedResponse = this.transformUrl(href);
            // There was no response or an exception happened
            if (oembedResponse == null)
                continue;
            // There is a handler for this response
            else if (this.getHandler().containsKey(oembedResponse.getSource()))
                this.getHandler().get(oembedResponse.getSource()).handle(document, a, oembedResponse);
            // Try to render the response itself and replace the current anchor
            else if ((renderedRespose = oembedResponse.render()) != null) {
                a.before(renderedRespose);
                a.remove();
            }
        } catch (OembedException e) {
            logger.warn(String.format("Skipping '%s': %s", href, e.getMessage()));
        }
    }
    if (changedBaseUri)
        document.setBaseUri(null);
    return document;
}

From source file:eu.sisob.uma.extractors.adhoc.websearchers.WebSearchersExtractor.java

/**
 *
 * @param nextLine/*from  w  ww  . j  ava2 s. com*/
 * @param idStaffIdentifier
 * @param idName
 * @param idFirstName
 * @param idLastName
 * @param idInitials
 * @param idSubject
 * @param idInstitutionName
 * @param idWebAddress
 * @param expression
 * @param params
 * @return
 */
@Override
protected String get_result(String[] nextLine, int idStaffIdentifier, int idName, int idFirstName,
        int idLastName, int idInitials, int idSubject, int idInstitutionName, int idWebAddress,
        String expression, Object[] params) {

    String keywords = " (PROFILE OR PHD OR RESEARCHER OR FACULTY OR PROFESSOR OR RESEARCH) AND ";
    keywords = "";

    String domain = clean_site(nextLine[idWebAddress]);
    String subject = nextLine[idSubject];
    String and_institution_name = (idInstitutionName != -1 ? " AND " + nextLine[idInstitutionName] : "");
    String expression_subject = expression + " AND " + subject;
    String expression_site = expression + " site: " + domain;
    String expression_inst_name = expression + and_institution_name;
    String expression_inst_name_and_subject = expression + and_institution_name + " AND " + subject;

    String url = "";

    switch (search_patterns) {
    case P1:
        url = "https://duckduckgo.com/html/?q=" + keywords + expression;
        break;
    case P2:
        url = "https://duckduckgo.com/html/?q=" + keywords + expression_subject;
        break;
    case P3:
        url = "https://duckduckgo.com/html/?q=" + keywords + expression_site;
        break;
    case P4:
        url = "https://duckduckgo.com/html/?q=" + keywords + expression_inst_name;
        break;
    case P5:
        url = "https://duckduckgo.com/html/?q=" + keywords + expression_inst_name_and_subject;
        break;
    default:
        url = "https://duckduckgo.com/html/?q=" + keywords + expression_subject;
        break;
    }
    Logger.getRootLogger().info("Go with " + url);
    boolean again = false;
    Document doc = null;
    do {
        doc = getDocumentFromPage(url, 10, 1000, 5000);

        if (doc != null && doc.text().contains("If this error persists, please let us know")) {
            try {
                Thread.sleep(30000);
            } catch (InterruptedException ex) {
            }
            again = true;
        } else {
            again = false;
        }
    } while (again);

    String final_result = "";
    if (doc != null && doc.select("div[class*=links_main] > a").size() > 0) {

        /* Write resercher founded */
        Elements elements = doc.select("div[class*=links_main] > a");

        /* We will take the first html page and the first pdf */

        HashMap<String, String> results = new HashMap<String, String>();

        int max_results = 2;
        int i_result = 0;
        for (Element e : elements) {
            if ((e.text().startsWith("[")
            //&& !e.text().startsWith("[PDF]")
            ) || e.absUrl("href").contains("duckduckgo.com/y.js") || e.absUrl("href").contains("wikipedia.")
                    || e.absUrl("href").contains("facebook.com") || e.absUrl("href").contains("microsoft.com")
                    || e.absUrl("href").contains("google.com") || e.absUrl("href").contains("linkedin")
                    || e.absUrl("href").contains("www.biography.com")
                    || e.absUrl("href").contains("biomedexperts.com")
                    || e.absUrl("href").contains("www.experts.scival.com")
                    || e.absUrl("href").contains("ratemyprofessors.com")
                    || e.absUrl("href").contains("flickr.com") || e.absUrl("href").endsWith(".txt")
                    || e.absUrl("href").endsWith(".csv") || e.absUrl("href").endsWith(".xml")
                    || e.absUrl("href").endsWith(".doc") || e.absUrl("href").endsWith(".docx")
                    || e.absUrl("href").endsWith(".xls") || e.absUrl("href").endsWith(".xlxs")
                    || e.absUrl("href").contains("www.amazon")) {
                max_results++;
                continue;
            }

            boolean add = false;
            String score = "";
            String ext = "";
            if (!results.containsKey("HTML") && !e.text().startsWith("[")) {
                //results.put("html", )

                File temp;
                try {
                    temp = File.createTempFile("temp-file-name", ".tmp");
                    URL fetched_url = Downloader.fetchURL(e.absUrl("href"));
                    FileUtils.copyURLToFile(fetched_url, temp);
                    long sizeInBytes = temp.length();
                    long sizeInMb = sizeInBytes / (1024 * 1024);
                    if (sizeInMb > 100) {
                        score = "B";
                    } else {
                        String content = FileUtils.readFileToString(temp);
                        if (content.contains(nextLine[idLastName])) {
                            score = "A";
                        } else {
                            score = "B";
                        }
                    }
                } catch (IOException ex) {
                    score = "B";
                }

                ext = "HTML";
                add = true;
            }

            //if(!results.containsKey("PDF") && e.text().startsWith("[PDF]")){                                                        
            //    score = "A";
            //    ext = "PDF";
            //    add = true;
            //}                          

            if (add) {
                String result = "";
                result += "\"" + nextLine[idStaffIdentifier] + "\";";
                result += "\"" + nextLine[idLastName] + "\";";
                result += "\"" + nextLine[idInitials] + "\";";
                if (idFirstName != -1)
                    result += "\"" + nextLine[idFirstName] + "\";";
                if (idName != -1)
                    result += "\"" + nextLine[idName] + "\";";
                result += "\"" + e.absUrl("href") + "\";";
                result += "\"" + ext + "\";";
                result += "\"" + "CV" + "\";";
                result += "\"" + score + "\"";
                result += "\r\n";
                results.put(ext, result);

                Logger.getRootLogger().info("Select " + e.absUrl("href") + " - " + e.text());
            }

            //                if(results.containsKey("PDF") && results.containsKey("HTML")){
            //                    break;
            //                }

            i_result++;
            if (max_results <= i_result) {
                break;
            }
        }

        //            if(results.containsKey("PDF"))
        //                final_result = results.get("PDF");
        //            else 
        if (results.containsKey("HTML"))
            final_result = results.get("HTML");
        else
            final_result = "";
    }

    return final_result;
}

From source file:eu.sisob.uma.extractors.adhoc.websearchers_cv.WebSearchersCVExtractor.java

/**
 *
 * @param nextLine/*from   w  ww.  j  a v  a2 s.c  om*/
 * @param idStaffIdentifier
 * @param idName
 * @param idFirstName
 * @param idLastName
 * @param idInitials
 * @param idSubject
 * @param idInstitutionName
 * @param idWebAddress
 * @param expression
 * @param params
 * @return
 */
@Override
protected String get_result(String[] nextLine, int idStaffIdentifier, int idName, int idFirstName,
        int idLastName, int idInitials, int idSubject, int idInstitutionName, int idWebAddress,
        String expression, Object[] params) {

    String domain = clean_site(nextLine[idWebAddress]);
    String subject = nextLine[idSubject];
    String expression_subject = expression + " " + subject + " " + files + " " + cv_keywords_in_query;
    expression_subject = expression_subject.replaceAll("\t", " ");
    expression_subject = expression_subject.replaceAll("  ", " ");

    String url = "https://duckduckgo.com/html/?q=" + expression_subject;
    Logger.getRootLogger().info("Go with " + url);
    boolean again = false;
    Document doc = null;
    do {
        doc = getDocumentFromPage(url, 10, 2000, 5000);

        if (doc != null && doc.text().contains("If this error persists, please let us know")) {
            try {
                Thread.sleep(30000);
            } catch (InterruptedException ex) {
            }
            again = true;
        } else {
            again = false;
        }
    } while (again);

    //if(doc.select("div[class*=links_main] > a[href*=" + domain + "]").size() > 0){
    String final_result = "";
    if (doc != null && doc.select("div[class*=links_main] > a").size() > 0) {

        /* Write resercher founded */
        Elements elements = doc.select("div[class*=links_main] > a");

        /* We will take the first html page and the first pdf */

        List<String[]> results = new ArrayList<String[]>();
        final int EXT_I = 0;
        final int SCORE_INT_I = 1;
        final int SCORE_LETTER_I = 2;
        final int RESULT_I = 3;
        final int WORST_SCORE = 67;

        //int max_results = elements.size();
        //int i_result = 0; 
        for (Element e : elements) {
            if ((e.text().startsWith("[") && !e.text().startsWith("[PDF]"))
                    || e.absUrl("href").contains("duckduckgo.com/y.js")
                    || e.absUrl("href").contains("wikipedia.") || e.absUrl("href").contains("microsoft.com")
                    || e.absUrl("href").contains("google.com") || e.absUrl("href").contains("linkedin")
                    || e.absUrl("href").contains("www.biography.com")
                    || e.absUrl("href").contains("biomedexperts.com")
                    || e.absUrl("href").contains("www.experts.scival.com")
                    || e.absUrl("href").contains("ratemyprofessors.com")
                    || e.absUrl("href").contains("flickr.com") || e.absUrl("href").endsWith(".txt")
                    || e.absUrl("href").endsWith(".csv") || e.absUrl("href").endsWith(".xml")
                    || e.absUrl("href").endsWith(".doc") || e.absUrl("href").endsWith(".docx")
                    || e.absUrl("href").endsWith(".xls") || e.absUrl("href").endsWith(".xlxs")
                    || e.absUrl("href").contains("www.amazon")) {
                continue;
            }

            boolean add = false;
            int score_int = WORST_SCORE;
            String score = "";
            String ext = "";

            if (e.text().startsWith("[PDF]") || e.text().startsWith("[DOCX]") || e.text().startsWith("[DOC]")
                    || e.text().startsWith("[RTF]")) {

                String clean_name_1 = e.text().replaceAll("[^\\w\\s]", "").toLowerCase();
                int i = e.absUrl("href").lastIndexOf("/");
                int f = e.absUrl("href").lastIndexOf(".");
                String clean_name_2 = "";
                if (i != -1 && f != -1)
                    clean_name_2 = e.absUrl("href").substring(i, f).toLowerCase();
                boolean b = false;
                for (String k : cv_keywords_in_name_list) {
                    if (clean_name_1.contains(k) || clean_name_2.contains(k)) {
                        b = true;
                        break;
                    }
                }
                if (b) {
                    score_int--;
                }

                if (clean_name_1.contains(nextLine[idLastName])
                        || clean_name_2.contains(nextLine[idLastName])) {
                    score_int--;
                }

                score = Character.toChars(score_int)[0] + "";
                add = true;
                ext = "PDF";
            }

            //if(!results.containsKey("HTML") && !e.text().startsWith("[")){
            //}                                                 

            if (add) {
                String result = "";
                result += "\"" + nextLine[idStaffIdentifier] + "\";";
                result += "\"" + nextLine[idLastName] + "\";";
                result += "\"" + nextLine[idInitials] + "\";";
                if (idFirstName != -1)
                    result += "\"" + nextLine[idFirstName] + "\";";
                if (idName != -1)
                    result += "\"" + nextLine[idName] + "\";";
                result += "\"" + e.absUrl("href") + "\";";
                result += "\"" + ext + "\";";
                result += "\"" + "CV" + "\";";
                result += "\"" + score + "\"";
                result += "\r\n";
                results.add(new String[] { ext, score_int + "", score, result });

                Logger.getRootLogger().info("Select " + e.absUrl("href") + " - " + score + " - " + e.text());
            }
        }

        final_result = "";
        int best_score = WORST_SCORE;
        for (String[] result : results) {

            if (result[EXT_I].equals("PDF")) {
                int act_score = Integer.parseInt(result[SCORE_INT_I]);

                if (act_score < best_score) {
                    best_score = act_score;
                    final_result = result[RESULT_I];
                }

            }
        }
    }

    return final_result;
}

From source file:com.aquest.emailmarketing.web.controllers.BroadcastTemplateController.java

/**
 * Define content./*  w ww  . j av  a  2  s. c  o m*/
 *
 * @param model the model
 * @param broadcastTemplate1 the broadcast template1
 * @param result the result
 * @param principal the principal
 * @return the string
 * @throws IOException 
 */
@RequestMapping(value = "/defineBcastTemplateContent", method = RequestMethod.POST)
public String defineContent(Model model,
        @Valid @ModelAttribute("broadcastTemplate") BroadcastTemplate broadcastTemplate1,
        @RequestParam(value = "fromUrl", required = false) String fromUrl,
        @RequestParam(value = "optimize", required = false) boolean optimize,
        @RequestParam(value = "baseurl", required = false) String baseUrl,
        @RequestParam(value = "rel2abs", required = false) boolean rel2abs, BindingResult result,
        Principal principal) throws IOException {
    String htmlBodyPrep = "";
    BroadcastTemplate broadcastTemplate = broadcastTemplateService
            .getBroadcastTemplateById(broadcastTemplate1.getId());
    broadcastTemplate.setB_template_subject(broadcastTemplate1.getB_template_subject());
    if (fromUrl != "") {
        Document doc = Jsoup.connect(fromUrl).get();
        htmlBodyPrep = doc.outerHtml();
        broadcastTemplate.setHtmlbody(htmlBodyPrep);
        System.out.println(htmlBodyPrep);
    }
    if (broadcastTemplate1.getHtmlbody() != null) {
        htmlBodyPrep = broadcastTemplate1.getHtmlbody();
        broadcastTemplate.setHtmlbody(htmlBodyPrep);
    }
    if (rel2abs == true) {
        if (baseUrl != null) {
            System.out.println(baseUrl);
            Document doc = Jsoup.parse(broadcastTemplate.getHtmlbody(), baseUrl);
            System.out.println(doc.toString());

            Elements images = doc.select("img");
            for (Element e : images) {
                e.attr("src", e.absUrl("src"));
                System.out.println(e.absUrl("src"));
            }
            broadcastTemplate.setHtmlbody(doc.outerHtml());
            htmlBodyPrep = doc.outerHtml();
        } else {
            // ovde staviti error handling
        }
    }
    if (optimize == true) {
        //          /* PREMAILER API OPTIONS
        //           * line_length - Line length used by to_plain_text. Boolean, default is 65.
        //             warn_level - What level of CSS compatibility warnings to show (see Warnings).
        //                NONE = 0
        //                SAFE = 1
        //                POOR = 2
        //                RISKY = 3
        //             link_query_string - A string to append to every a href="" link. Do not include the initial ?.
        //             base_url - Used to calculate absolute URLs for local files.
        //             css - Manually specify CSS stylesheets.
        //             css_to_attributes - Copy related CSS attributes into HTML attributes (e.g. background-color to bgcolor)
        //             css_string - Pass CSS as a string
        //             remove_ids - Remove ID attributes whenever possible and convert IDs used as anchors to hashed to avoid collisions in webmail programs. Default is false.
        //             remove_classes - Remove class attributes. Default is false.
        //             remove_comments -  Remove html comments. Default is false.
        //             preserve_styles - Whether to preserve any link rel=stylesheet and style elements. Default is false.
        //             preserve_reset - Whether to preserve styles associated with the MailChimp reset code
        //             with_html_string -  Whether the html param should be treated as a raw string.
        //             verbose - Whether to print errors and warnings to $stderr. Default is false.
        //             adapter - Which HTML parser to use, either :nokogiri or :hpricot. Default is :hpricot.
        //          */
        Premailer premailer = new Premailer();
        PremailerInterface premailerInterface = premailer.getPremailerInstance();

        Map<String, Object> options = new HashMap<String, Object>();
        options.put("with_html_string", true);
        options.put("base_url", fromUrl);
        premailerInterface.init(broadcastTemplate.getHtmlbody(), options);
        //premailerInterface.init(htmlBodyPrep, options);
        broadcastTemplate.setHtmlbody(premailerInterface.inline_css());
        System.out.println(premailerInterface.inline_css());
        premailer.destroyInstance();
    }
    broadcastTemplate.setPlaintext(broadcastTemplate1.getPlaintext());
    System.out.println(broadcastTemplate.toString());
    String bcast_id = broadcastTemplateService.SaveOrUpdate(broadcastTemplate);
    // Find URLs in html body and add tracking code
    Urls urls = new Urls();
    String html = broadcastTemplate.getHtmlbody();
    List<String> urlList = new ArrayList<String>();
    Document doc = Jsoup.parse(html);
    Elements links = doc.select("a[href]");
    for (Element link : links) {
        if (link.attr("abs:href").length() > 5) {
            urlList.add(link.attr("abs:href"));
        }
    }
    model.addAttribute("urlList", urlList);
    model.addAttribute("urls", urls);

    // Google Analytics - utmCampaign List
    List<String> utmCampaignList = new ArrayList<String>();
    utmCampaignList.add("[BROADAST_NAME]");
    model.addAttribute("utmCampaignList", utmCampaignList);

    // Google Analytics - utmSource List
    List<String> utmSourceList = new ArrayList<String>();
    utmSourceList.add("[CAMPAIGN_NAME]");
    model.addAttribute("utmSourceList", utmSourceList);

    // Google Analytics - utmContent List
    List<String> utmContentList = new ArrayList<String>();
    utmContentList.add("[EMAIL]");
    // ovde dodati sve varijabilne podatke iz CM_EMAIL_BROADCAST_LIST
    model.addAttribute("utmContentList", utmContentList);

    model.addAttribute("broadcastTemplate", broadcastTemplate);
    return "bcasttemptracking";
}

From source file:ac.simons.oembed.Oembed.java

/**
 * Tries to discover an oembed endpoint for the given URL by fetching the page and looking for
 * {@code rel="alternate"} elements of type {@code application/json+oembed} or
 * {@code text/xml+oembed}. The first matching element wins.
 *
 * @param url the URL of the page to inspect
 * @return a provider for the discovered endpoint, or {@code null} if the request did not return
 *         status 200, no endpoint was found or an error occurred (errors are logged, not thrown)
 */
private OembedProvider autodiscoverOembedURIForUrl(final String url) {
    OembedProvider rv = null;

    try {
        final HttpGet request = new HttpGet(url);
        final HttpResponse httpResponse = this.httpClient.execute(request);
        if (httpResponse.getStatusLine().getStatusCode() != HttpStatus.SC_OK)
            logger.warn(String.format("Autodiscovery for %s failed, server returned error %d: %s", url,
                    httpResponse.getStatusLine().getStatusCode(),
                    EntityUtils.toString(httpResponse.getEntity())));
        else {
            final URI uri = request.getURI();
            // URI.getPort() is -1 when the URL carries no explicit port. Appending that would give
            // an invalid base URI and relative hrefs could no longer be resolved.
            final StringBuilder baseUri = new StringBuilder();
            baseUri.append(uri.getScheme()).append("://").append(uri.getHost());
            if (uri.getPort() != -1)
                baseUri.append(':').append(uri.getPort());

            final Document document = Jsoup.parse(EntityUtils.toString(httpResponse.getEntity(), "UTF-8"),
                    baseUri.toString());
            for (Element alternate : document.getElementsByAttributeValue("rel", "alternate")) {
                final String type = alternate.attr("type");
                if (type.equalsIgnoreCase("application/json+oembed"))
                    rv = new AutodiscoveredOembedProvider(url, new URI(alternate.absUrl("href")), "json");
                else if (type.equalsIgnoreCase("text/xml+oembed"))
                    rv = new AutodiscoveredOembedProvider(url, new URI(alternate.absUrl("href")), "xml");
                if (rv != null)
                    break;
            }
        }
    } catch (Exception e) {
        logger.warn(String.format("Autodiscovery for %s failedd: %s", url, e.getMessage()), e);
    }

    return rv;
}