Example usage of org.jsoup.nodes.Element.text()

Introduction

This page collects example usages of the method org.jsoup.nodes.Element.text(), taken from open-source projects.

Prototype

public String text()

Source Link

Document

Gets the combined text of this element and all its children.

Usage

From source file:de.geeksfactory.opacclient.apis.Zones22.java

/**
 * Loads the account overview: pending fees, card validity, lent media and reservations.
 *
 * <p>The page returned by {@code login(acc)} is scanned for its summary cells to find the links
 * to the lent-media and reservation pages; each of those pages is then fetched with
 * {@code httpGet} and parsed table by table.
 *
 * @param acc the account to log in with; its id is stored in the returned data
 * @return the collected account data, or {@code null} when the login yields no document or the
 *         summary page contains no "Entliehene Medien" cell
 */
@Override
public AccountData account(Account acc)
        throws IOException, NotReachableException, JSONException, SocketException, OpacErrorException {
    Document login = login(acc);
    if (login == null)
        return null;

    AccountData res = new AccountData(acc.getId());

    String lent_link = null;
    String res_link = null;
    int lent_cnt = -1;
    int res_cnt = -1;
    // Each summary cell holds a section name; the value belonging to it is in the next sibling cell.
    for (Element td : login.select(
            ".AccountSummaryCounterNameCell, .AccountSummaryCounterNameCellStripe, .CAccountDetailFieldNameCellStripe, .CAccountDetailFieldNameCell")) {
        String section = td.text().trim();
        if (section.contains("Entliehene Medien")) {
            lent_link = td.select("a").attr("href");
            lent_cnt = Integer.parseInt(td.nextElementSibling().text().trim());
        } else if (section.contains("Vormerkungen")) {
            res_link = td.select("a").attr("href");
            res_cnt = Integer.parseInt(td.nextElementSibling().text().trim());
        } else if (section.contains("Kontostand")) {
            res.setPendingFees(td.nextElementSibling().text().trim());
        } else if (section.matches("Ausweis g.ltig bis")) {
            // "." stands in for the umlaut so the match does not depend on the page encoding.
            res.setValidUntil(td.nextElementSibling().text().trim());
        }
    }
    assert (lent_cnt >= 0);
    assert (res_cnt >= 0);
    if (lent_link == null)
        return null;

    // ---- Lent media ----
    String lent_html = httpGet(opac_url + "/" + lent_link.replace("utf-8?Method", "utf-8&Method"),
            getDefaultEncoding());
    Document lent_doc = Jsoup.parse(lent_html);
    List<Map<String, String>> lent = new ArrayList<Map<String, String>>();

    SimpleDateFormat sdf = new SimpleDateFormat("dd/MM/yyyy", Locale.GERMAN);
    Pattern id_pat = Pattern.compile("javascript:renewItem\\('[0-9]+','(.*)'\\)");

    for (Element table : lent_doc
            .select(".LoansBrowseItemDetailsCellStripe table, .LoansBrowseItemDetailsCell table")) {
        Map<String, String> item = new HashMap<String, String>();

        for (Element tr : table.select("tr")) {
            String desc = tr.select(".LoanBrowseFieldNameCell").text().trim();
            String value = tr.select(".LoanBrowseFieldDataCell").text().trim();
            if (desc.equals("Titel"))
                item.put(AccountData.KEY_LENT_TITLE, value);
            if (desc.equals("Verfasser"))
                item.put(AccountData.KEY_LENT_AUTHOR, value);
            if (desc.equals("Mediennummer"))
                item.put(AccountData.KEY_LENT_BARCODE, value);
            if (desc.equals("ausgeliehen in"))
                item.put(AccountData.KEY_LENT_BRANCH, value);
            if (desc.matches("F.+lligkeits.*datum")) {
                // Only the part before the first space is kept as the deadline.
                value = value.split(" ")[0];
                item.put(AccountData.KEY_LENT_DEADLINE, value);
                try {
                    item.put(AccountData.KEY_LENT_DEADLINE_TIMESTAMP,
                            String.valueOf(sdf.parse(value).getTime()));
                } catch (ParseException e) {
                    // Best effort: the textual deadline is kept even if it cannot be parsed.
                    e.printStackTrace();
                }
            }
        }
        // The renewal id is the second argument of the renewItem(...) javascript link.
        if (table.select(".button[Title~=Zum]").size() == 1) {
            Matcher matcher1 = id_pat.matcher(table.select(".button[Title~=Zum]").attr("href"));
            if (matcher1.matches()) {
                item.put(AccountData.KEY_LENT_LINK, matcher1.group(1));
            }
        }
        lent.add(item);
    }
    res.setLent(lent);
    assert (lent_cnt <= lent.size());

    // ---- Reservations ----
    List<Map<String, String>> reservations = new ArrayList<Map<String, String>>();
    // The summary page may lack a "Vormerkungen" cell; in that case there is no page to fetch
    // (previously this requested opac_url + "/null").
    if (res_link != null) {
        String res_html = httpGet(opac_url + "/" + res_link, getDefaultEncoding());
        Document res_doc = Jsoup.parse(res_html);

        for (Element table : res_doc
                .select(".MessageBrowseItemDetailsCell table, .MessageBrowseItemDetailsCellStripe table")) {
            Map<String, String> item = new HashMap<String, String>();

            for (Element tr : table.select("tr")) {
                String desc = tr.select(".MessageBrowseFieldNameCell").text().trim();
                String value = tr.select(".MessageBrowseFieldDataCell").text().trim();
                if (desc.equals("Titel"))
                    item.put(AccountData.KEY_RESERVATION_TITLE, value);
                if (desc.equals("Publikationsform"))
                    item.put(AccountData.KEY_RESERVATION_FORMAT, value);
                if (desc.equals("Liefern an"))
                    item.put(AccountData.KEY_RESERVATION_BRANCH, value);
                if (desc.equals("Status"))
                    item.put(AccountData.KEY_RESERVATION_READY, value);
            }
            // Deleted reservations are not reported.
            // NOTE(review): "Gelscht" looks like "Gelöscht" with the umlaut lost; both spellings
            // are accepted here until the intended literal is confirmed.
            String status = item.get(AccountData.KEY_RESERVATION_READY);
            if ("Gelscht".equals(status) || "Gel\u00f6scht".equals(status)) {
                continue;
            }
            reservations.add(item);
        }
    }
    res.setReservations(reservations);
    assert (reservations.size() >= res_cnt);

    return res;
}

From source file:com.fluidops.iwb.provider.HTMLProvider.java

/**
 * Fetches the configured page, dumps its media, imports, links and table rows to
 * {@code HTMLdata.txt} (and via {@code print}), and turns the table rows into statements.
 *
 * <p>Rows are taken from every {@code tbody tr}; the first three and the last two rows are
 * skipped. Row 3 supplies the type/predicate URIs from its first four cells; each later row
 * yields one type statement, one label and three literal statements appended to {@code res}.
 *
 * @param res list that receives the generated statements
 * @throws Exception on any failure while fetching the page or writing the dump file
 */
@Override
public void gather(List<Statement> res) throws Exception {

    String url = config.url;
    Document doc = Jsoup.connect(url).get();
    Elements links = doc.select("a[href]");
    Elements media = doc.select("[src]");
    Elements imports = doc.select("link[href]");
    Elements article = doc.getElementsByTag("tbody").select("tr");
    Elements tableElem;
    URI nameURI = null;
    URI roadsURI = null;
    URI sideURI = null;
    URI totalURI = null;

    File file = new File("HTMLdata.txt");
    PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(file)));
    // Close the writer even when parsing fails half-way, so the file handle is not leaked.
    try {
        out.println("Media");
        print("\nMedia: (%d)", media.size());
        for (Element el : media) {
            if (el.tagName().equals("img")) {
                print(" * %s: <%s> %sx%s (%s)", el.tagName(), el.attr("abs:src"), el.attr("width"),
                        el.attr("height"), trim(el.attr("alt"), 20));
                out.printf(" \n * %s: <%s> %sx%s (%s)", el.tagName(), el.attr("abs:src"), el.attr("width"),
                        el.attr("height"), trim(el.attr("alt"), 20));
                out.println();
            } else {
                print(" * %s: <%s>", el.tagName(), el.attr("abs:src"));
                out.printf(" \n * %s: <%s>", el.tagName(), el.attr("abs:src"));
                out.println();
            }
        }

        out.println("Imports");
        print("\nImports: (%d)", imports.size());
        for (Element link : imports) {
            print(" * %s <%s> (%s)", link.tagName(), link.attr("abs:href"), link.attr("rel"));
            out.printf(" * %s <%s> (%s)", link.tagName(), link.attr("abs:href"), link.attr("rel"));
            out.println();
        }

        out.println("Links");
        print("\nLinks: (%d)", links.size());
        for (Element link : links) {
            print(" * a: <%s> (%s)", link.attr("abs:href"), trim(link.text(), 35));
            out.printf(" * a: <%s> (%s)", link.attr("abs:href"), link.text());
            out.println();
        }

        out.println("Article");
        print("\nArticle: (%d)", article.size());

        // NOTE(review): every processed row is expected to have at least four cells;
        // a shorter row makes get(...) throw.
        for (int i = 3; i < article.size() - 2; i++) {
            tableElem = article.get(i).select("td");
            out.println();

            if (i == 3) {
                // Row 3 names the columns; its cell texts become the type and predicate URIs.
                nameURI = ProviderUtils.objectToUri(tableElem.get(0).text());
                roadsURI = ProviderUtils.objectToUri(tableElem.get(1).text());
                sideURI = ProviderUtils.objectToUri(tableElem.get(2).text());
                totalURI = ProviderUtils.objectToUri(tableElem.get(3).text());

            } else {
                // The first cell identifies the row's subject; build its URI once.
                String name = tableElem.get(0).text();
                URI subject = ProviderUtils.objectToUri(name);

                res.add(ProviderUtils.createStatement(subject, RDF.TYPE, nameURI));
                res.add(ProviderUtils.createLiteralStatement(subject, RDFS.LABEL, name));
                res.add(ProviderUtils.createLiteralStatement(subject, roadsURI, tableElem.get(1).text()));
                res.add(ProviderUtils.createLiteralStatement(subject, sideURI, tableElem.get(2).text()));
                res.add(ProviderUtils.createLiteralStatement(subject, totalURI, tableElem.get(3).text()));

                for (Element el : tableElem) {
                    out.printf("\n * (%s): (%s)", el.tagName(), el.text());
                    out.println();
                }
            }
            out.println();
            out.printf("\n * a (%s) (%d): (%s)", article.get(i).tagName(), tableElem.size(), article.get(i).text());
            out.println();
        }
    } finally {
        out.close();
    }
}

From source file:be.ibridge.kettle.jsoup.JsoupInput.java

/**
 * Builds the output row for the current record ({@code data.recordnr}) and advances the record
 * counter.
 *
 * <p>For every input field the matching element is read from {@code data.resultList}, turned
 * into a string according to the field's element/result type, trimmed, converted to the target
 * value type and stored in the row. Optional file-related columns are appended afterwards, and
 * the finished row is remembered in {@code data.previousRow} for "repeat" fields.
 *
 * @return the populated row
 * @throws KettleException declared by the value conversion / row cloning calls used here
 */
private Object[] buildRow() throws KettleException {
    // Start from an empty row, or from a copy of the incoming row when there is one.
    Object[] outputRowData = buildEmptyRow();

    if (data.readrow != null)
        outputRowData = data.readrow.clone();

    // Read fields...
    for (int i = 0; i < data.nrInputFields; i++) {
        JsoupInputField field = meta.getInputFields()[i];
        // Position of this field's value inside the output row.
        int targetIndex = data.totalpreviousfields + i;

        // Elements selected for this field; the current record picks one of them.
        Elements jsoupa = data.resultList.get(i);
        String nodevalue = null;
        if (jsoupa != null) {
            Element jo = jsoupa.get(data.recordnr);
            if (jo != null) {

                // Element type decides between node content and an attribute value.
                switch (field.getElementType()) {
                case JsoupInputField.ELEMENT_TYPE_NODE:
                    // Result type decides which representation of the node is used.
                    switch (field.getResultType()) {
                    case JsoupInputField.RESULT_TYPE_TEXT:
                        nodevalue = jo.text();
                        break;
                    case JsoupInputField.RESULT_TYPE_TYPE_OUTER_HTML:
                        nodevalue = jo.outerHtml();
                        break;
                    case JsoupInputField.RESULT_TYPE_TYPE_INNER_HTML:
                        nodevalue = jo.html();
                        break;
                    default:
                        nodevalue = jo.toString();
                        break;
                    }
                    break;
                case JsoupInputField.ELEMENT_TYPE_ATTRIBUT:
                    nodevalue = jo.attr(field.getAttribute());
                    break;
                default:
                    nodevalue = jo.toString();
                    break;
                }
            }
        }

        // Do trimming
        switch (field.getTrimType()) {
        case JsoupInputField.TYPE_TRIM_LEFT:
            nodevalue = Const.ltrim(nodevalue);
            break;
        case JsoupInputField.TYPE_TRIM_RIGHT:
            nodevalue = Const.rtrim(nodevalue);
            break;
        case JsoupInputField.TYPE_TRIM_BOTH:
            nodevalue = Const.trim(nodevalue);
            break;
        default:
            break;
        }

        if (meta.isInFields()) {
            // Add result field to input stream
            outputRowData = RowDataUtil.addValueData(outputRowData, targetIndex, nodevalue);
        }

        // Convert the raw string to the declared output type.
        ValueMetaInterface targetValueMeta = data.outputRowMeta.getValueMeta(targetIndex);
        ValueMetaInterface sourceValueMeta = data.convertRowMeta.getValueMeta(targetIndex);
        outputRowData[targetIndex] = targetValueMeta.convertData(sourceValueMeta, nodevalue);

        // A "repeated" field with an empty value takes over the value of the previous row.
        if (field.isRepeated()) {
            if (data.previousRow != null && Const.isEmpty(nodevalue)) {
                outputRowData[targetIndex] = data.previousRow[targetIndex];
            }
        }
    } // End of loop over fields...

    // Optional extra columns are appended in a fixed order starting at this index.
    int rowIndex = data.nrInputFields;

    // See if we need to add the filename to the row...
    if (meta.includeFilename() && !Const.isEmpty(meta.getFilenameField())) {
        outputRowData[rowIndex++] = data.filename;
    }
    // See if we need to add the row number to the row...
    if (meta.includeRowNumber() && !Const.isEmpty(meta.getRowNumberField())) {
        outputRowData[rowIndex++] = Long.valueOf(data.rownr);
    }
    // Possibly add short filename...
    if (meta.getShortFileNameField() != null && meta.getShortFileNameField().length() > 0) {
        outputRowData[rowIndex++] = data.shortFilename;
    }
    // Add Extension
    if (meta.getExtensionField() != null && meta.getExtensionField().length() > 0) {
        outputRowData[rowIndex++] = data.extension;
    }
    // add path
    if (meta.getPathField() != null && meta.getPathField().length() > 0) {
        outputRowData[rowIndex++] = data.path;
    }
    // Add Size
    if (meta.getSizeField() != null && meta.getSizeField().length() > 0) {
        outputRowData[rowIndex++] = Long.valueOf(data.size);
    }
    // add Hidden
    // NOTE(review): the hidden flag is parsed from data.path, the same value used for the path
    // column above; this looks like a copy/paste slip. Confirm which field holds the flag.
    if (meta.isHiddenField() != null && meta.isHiddenField().length() > 0) {
        outputRowData[rowIndex++] = Boolean.valueOf(data.path);
    }
    // Add modification date
    if (meta.getLastModificationDateField() != null && meta.getLastModificationDateField().length() > 0) {
        outputRowData[rowIndex++] = data.lastModificationDateTime;
    }
    // Add Uri
    if (meta.getUriField() != null && meta.getUriField().length() > 0) {
        outputRowData[rowIndex++] = data.uriName;
    }
    // Add RootUri
    if (meta.getRootUriField() != null && meta.getRootUriField().length() > 0) {
        outputRowData[rowIndex++] = data.rootUriName;
    }
    data.recordnr++;

    RowMetaInterface irow = getInputRowMeta();

    // Keep a copy as the previous row so a later step cannot change it in between.
    data.previousRow = irow == null ? outputRowData : (Object[]) irow.cloneRow(outputRowData);

    return outputRowData;
}

From source file:org.shareok.data.sagedata.SageSourceDataHandlerImpl.java

/**
 * Reads the article keywords from a full-text document.
 *
 * <p>The keywords are the texts of all {@code a} elements inside the first
 * {@code div.hlFld-KeywordText} element of the document.
 *
 * @param doc the parsed full-text page
 * @return the keyword texts in document order, or {@code null} when the keyword container or
 *         its links are missing
 */
private String[] getArticleKeyWordsFromFullTextDoc(Document doc) throws NoHtmlComponentsFoundException {
    Elements keywordBlocks = doc.select("div.hlFld-KeywordText");
    if (keywordBlocks == null || keywordBlocks.isEmpty()) {
        return null;
    }

    Elements keywordLinks = keywordBlocks.get(0).select("a");
    if (keywordLinks == null || keywordLinks.isEmpty()) {
        return null;
    }

    // At least one link exists here, so the result is never an empty array.
    String[] keywords = new String[keywordLinks.size()];
    int next = 0;
    for (Element anchor : keywordLinks) {
        keywords[next++] = anchor.text();
    }
    return keywords;
}

From source file:com.jimplush.goose.ContentExtractor.java

/**
 * remove any divs that looks like non-content, clusters of links, or paras with no gusto
 *
 * @param node/*from  w w w.j  av  a2s . c om*/
 * @return
 */
private Element cleanupNode(Element node) {
    if (logger.isDebugEnabled()) {
        logger.debug("Starting cleanup Node");
    }

    node = addSiblings(node);

    Elements nodes = node.children();
    for (Element e : nodes) {
        if (e.tagName().equals("p")) {
            continue;
        }
        if (logger.isDebugEnabled()) {
            logger.debug("CLEANUP  NODE: " + e.id() + " class: " + e.attr("class"));
        }
        boolean highLinkDensity = isHighLinkDensity(e);
        if (highLinkDensity) {
            if (logger.isDebugEnabled()) {
                logger.debug("REMOVING  NODE FOR LINK DENSITY: " + e.id() + " class: " + e.attr("class"));
            }
            e.remove();
            continue;
        }
        // now check for word density
        // grab all the paragraphs in the children and remove ones that are too small to matter
        Elements subParagraphs = e.getElementsByTag("p");

        for (Element p : subParagraphs) {
            if (p.text().length() < 25) {
                p.remove();
            }
        }

        // now that we've removed shorty paragraphs let's make sure to exclude any first paragraphs that don't have paras as
        // their next siblings to avoid getting img bylines
        // first let's remove any element that now doesn't have any p tags at all
        Elements subParagraphs2 = e.getElementsByTag("p");
        if (subParagraphs2.size() == 0 && !e.tagName().equals("td")) {
            if (logger.isDebugEnabled()) {
                logger.debug("Removing node because it doesn't have any paragraphs");
            }
            e.remove();
            continue;
        }

        //if this node has a decent enough gravityScore we should keep it as well, might be content
        int topNodeScore = getScore(node);
        int currentNodeScore = getScore(e);
        float thresholdScore = (float) (topNodeScore * .08);
        if (logger.isDebugEnabled()) {
            logger.debug("topNodeScore: " + topNodeScore + " currentNodeScore: " + currentNodeScore
                    + " threshold: " + thresholdScore);
        }
        if (currentNodeScore < thresholdScore) {
            if (!e.tagName().equals("td")) {
                if (logger.isDebugEnabled()) {
                    logger.debug("Removing node due to low threshold score");
                }
                e.remove();
            } else {
                if (logger.isDebugEnabled()) {
                    logger.debug("Not removing TD node");
                }
            }

            continue;
        }

    }

    return node;

}

From source file:org.shareok.data.sagedata.SageJournalIssueDateProcessor.java

/**
 * Adds the home and issue-list links of the journals on the SAGE publications page to the map.
 *
 * <p>For every table row of the {@code browsePublicationsForm} form, the link in the second cell
 * gives the journal name (link text) and its home link; the issue link is built from the last
 * path segment of the home link. Existing "homeLink"/"issueLink" entries are left untouched.
 * Any exception is printed and the map is returned as filled so far.
 *
 * @param journalMap journal name to info map; updated in place
 * @return the same {@code journalMap} instance
 */
public Map<String, Map<String, String>> updateSageJournalLinks(Map<String, Map<String, String>> journalMap) {
    try {
        Document page = Jsoup
                .connect("http://journals.sagepub.com/action/showPublications?pageSize=20&startPage=199")
                .userAgent(
                        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36")
                .cookie("auth", "token").timeout(300000).get();

        Element form = page.select("form#browsePublicationsForm").get(0);
        Element tableBody = form.select("table").get(0).select("tbody").get(0);

        for (Element row : tableBody.select("tr")) {
            Element anchor = row.select("td").get(1).select("a").get(0);
            String name = anchor.text();
            String homeLink = SageDataUtil.SAGE_HTTP_PREFIX + anchor.attr("href");
            String[] segments = homeLink.split("/");
            String issueLink = SageDataUtil.SAGE_HTTP_PREFIX + "/loi/" + segments[segments.length - 1];

            // Get or create the journal's entry, then fill in only what is still missing.
            Map<String, String> info = journalMap.get(name);
            if (info == null) {
                info = new HashMap<>();
                journalMap.put(name, info);
            }
            if (info.get("homeLink") == null) {
                info.put("homeLink", homeLink);
            }
            if (info.get("issueLink") == null) {
                info.put("issueLink", issueLink);
            }
        }
    } catch (Exception ex) {
        ex.printStackTrace();
    }
    return journalMap;
}

From source file:crawler.HackerEarthCrawler.java

@Override
public void crawl() {

    int flag = 0;

    //set of urls which should be crawled
    TreeSet<String> linksset = new TreeSet<String>();
    TreeSet<String> tempset = new TreeSet<String>();
    TreeSet<String> tutorialset = new TreeSet<String>();
    //final set of problem urls
    TreeSet<String> problemset = new TreeSet<String>();
    //visited for maintaing status of if url is already crawled or not
    TreeMap<String, Integer> visited = new TreeMap<String, Integer>();

    //add base url
    linksset.add(baseUrl);/*  w w  w  .jav  a 2 s.co  m*/
    //mark base url as not crawled
    visited.put(baseUrl, 0);

    try {
        while (true) {
            flag = 0;
            tempset.clear();

            for (String str : linksset) {
                //check if url is already crawled or not and it has valid domain name
                if ((visited.get(str) == 0) && (str.startsWith("https://www.hackerearth.com/"))) {
                    System.out.println("crawling  " + str);

                    //retriving response of current url as document
                    Document doc = Jsoup.connect(str).timeout(0).userAgent(
                            "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0")
                            .referrer("http://www.google.com").ignoreHttpErrors(true).get();
                    //retriving all urls from current page
                    Elements links = doc.select("a[href]");

                    //mark url as crawled
                    visited.put(str, 1);

                    //mark flag as url is crawled
                    flag = 1;
                    //retrive all urls
                    for (Element link : links) {
                        if (link.absUrl("href").endsWith("/tutorial/")) {
                            tutorialset.add(link.absUrl("href"));
                        }
                        //check if url is problem url then add it in problemurlset
                        if (link.absUrl("href").startsWith("https://www.hackerearth.com/")
                                && isProblemUrl(link.absUrl("href"))) {
                            problemset.add(link.absUrl("href"));
                        }
                        //check if url has valid domain and it has problem urls or not
                        if (link.absUrl("href").contains(("https://www.hackerearth.com/"))
                                && isCrawlable(link.absUrl("href"))) {
                            //if link is not visited then mark it as uncrawled
                            if (!visited.containsKey(link.absUrl("href"))) {
                                visited.put(link.absUrl("href"), 0);
                            }
                            //add it in tempsetorary set
                            tempset.add(link.absUrl("href"));
                            //System.out.println("\n  base: "+str+" ::: link  : " + link.absUrl("href"));
                        }
                    }
                }
            }
            //if nothing is left to crawl break the loop
            if (flag == 0) {
                break;
            }
            //add all retrieved links to linksset
            linksset.addAll(tempset);
        }

        System.out.println("\n\ntotal problem urls " + problemset.size());

        int i = 0;
        for (String str : problemset) {
            System.out.println("link " + i + " : " + str);
            i++;
        }

    } catch (IOException ex) {
        Logger.getLogger(HackerEarthCrawler.class.getName()).log(Level.SEVERE, null, ex);
    }

    //scrap and store into database
    //for every problem url scrap problem page
    for (String problemUrl : problemset) {

        System.out.println("problemUrl :" + problemUrl);
        try {
            //create problem class to store in database
            Problem problem = new Problem();
            String problemSIOC = "", problemIOC = "";
            String problemTitle = "", problemStatement = "", problemInput = "", problemOutput = "",
                    problemConstraints = "";
            String sampleInput = "", sampleOutput = "";
            String problemExplanation = "";
            //set default timelimit to 1 second
            double problemTimeLimit = 1.0;
            ArrayList<String> tags = new ArrayList<String>();

            //get response for given problem url
            Response response = Jsoup.connect(problemUrl).execute();
            Document doc = response.parse();

            //retrieve problem title from page
            Element elementTitle = doc.getElementsByTag("title").first();
            StringTokenizer stTitle = new StringTokenizer(elementTitle.text(), "|");
            problemTitle = stTitle.nextToken().trim();

            Element content = doc.getElementsByClass("starwars-lab").first();
            problemSIOC = content.text();
            Elements e = content.children();

            //to find problem statement
            String breakloop[] = { "input", "input:", "input :", "input format:", "input format :",
                    "input format", "Input and output", "constraints :", "constraints:", "constraints",
                    "$$Input :$$" };
            flag = 0;
            for (Element p : e) {
                String tempStatement = "";
                for (Element pp : p.getAllElements()) {

                    for (String strbreak : breakloop) {
                        if (StringUtils.equalsIgnoreCase(pp.ownText(), strbreak)) {
                            //System.out.println("strbreak :"+strbreak);

                            tempStatement = p.text().substring(0,
                                    p.text().toLowerCase().indexOf(strbreak.toLowerCase()));
                            // System.out.println("temp "+tempStatement);
                            flag = 1;
                            break;
                        }
                    }
                }

                if (flag == 1) {
                    problemStatement += tempStatement;
                    //remove extra space at end
                    if (tempStatement.length() == 0) {
                        problemStatement = problemStatement.substring(0, problemStatement.length() - 1);
                    }
                    break;
                }
                problemStatement += p.text() + " ";
            }

            System.out.println("problemSIOC :" + problemSIOC);
            System.out.println("problemStatement :" + problemStatement);

            if (problemStatement.length() <= problemSIOC.length()) {
                //remove problem statement from whole text and remove extra spaces at the beginning and the end
                problemIOC = problemSIOC.substring(problemStatement.length()).trim();
            } else {
                problemIOC = "";
            }

            System.out.println("problemIOC :" + problemIOC);

            //keywords for identifying input
            String decideInput[] = { "Input format :", "Input format:", "Input format", "inputformat:",
                    "inputformat :", "inputformat", "input and output", "input :", "input:", "input" };
            //keywords for identifying output
            String decideOutput[] = { "output format :", "output format:", "Output format", "outputformat:",
                    "outputformat :", "outputformat", "output :", "output:", "output" };
            //keywords for identifying constraint
            String decideConstraint[] = { "constraints:", "constraints :", "constraints", "Constraints :",
                    "constraint:", "constraint :", "constraint", "Contraints :" };

            int posin = 0, posoutput = 0, poscon = 0, idxin, idxout, idxcon, flaginput = 0, flagoutput = 0,
                    flagcon = 0, inlen = 0, outlen = 0, conlen = 0;

            //find inputformat position,length of keyword
            for (idxin = 0; idxin < decideInput.length; idxin++) {
                if (StringUtils.containsIgnoreCase(problemIOC, decideInput[idxin])) {

                    posin = problemIOC.toLowerCase().indexOf(decideInput[idxin].toLowerCase());
                    flaginput = 1;
                    inlen = decideInput[idxin].length();

                    //decide it is keyowrd for actucal input or it is "sample input"
                    if (StringUtils.containsIgnoreCase(problemIOC, "sample input")) {
                        if (posin > problemIOC.toLowerCase().indexOf("sample input")) {
                            flaginput = 0;
                            inlen = 0;
                        } else {
                            break;
                        }
                    } else {
                        break;
                    }
                }
            }

            //find outputformat position,length of keyword
            for (idxout = 0; idxout < decideOutput.length; idxout++) {
                if (StringUtils.containsIgnoreCase(problemIOC, decideOutput[idxout])) {
                    posoutput = problemIOC.toLowerCase().indexOf(decideOutput[idxout].toLowerCase());
                    flagoutput = 1;
                    outlen = decideOutput[idxout].length();
                    break;
                }
            }

            //find constraint position,length of keyword
            for (idxcon = 0; idxcon < decideConstraint.length; idxcon++) {
                if (StringUtils.containsIgnoreCase(problemIOC, decideConstraint[idxcon])) {
                    poscon = problemIOC.toLowerCase().indexOf(decideConstraint[idxcon].toLowerCase());
                    flagcon = 1;
                    conlen = decideConstraint[idxcon].length();
                    break;
                }
            }

            System.out.println("input " + flaginput + " " + inlen + " " + posin);
            System.out.println("output " + flagoutput + " " + outlen + " " + posoutput);
            System.out.println("constraint " + flagcon + " " + conlen + " " + poscon);
            //retrieve problem input and output if present in problem page

            //if input format is present
            if (flaginput == 1) {
                //if input keyword is "input and output" and contraint is present in problem page
                if (idxin == 6 && flagcon == 1) {
                    problemInput = problemIOC.substring(inlen, poscon);
                }
                //if input keyword is "input and output" and contraint is not present in problem page
                else if (idxin == 6 && flagcon == 0) {
                    problemInput = problemIOC.substring(inlen);
                }
                //if output format and constraint is present
                else if (flagoutput == 1 && flagcon == 1) {
                    //if constraint is present before input format
                    if (poscon < posin) {
                        problemInput = problemIOC.substring(posin + inlen, posoutput);
                        problemOutput = problemIOC.substring(posoutput + outlen);
                    }
                    //if constraint is present before sample
                    else if (poscon < posoutput) {
                        problemInput = problemIOC.substring(inlen, poscon);
                        problemOutput = problemIOC.substring(posoutput + outlen);
                    } else {
                        problemInput = problemIOC.substring(inlen, posoutput);
                        problemOutput = problemIOC.substring(posoutput + outlen, poscon);
                    }
                }
                //if constraint is not present
                else if (flagoutput == 1 && flagcon == 0) {
                    problemInput = problemIOC.substring(inlen, posoutput);
                    problemOutput = problemIOC.substring(posoutput + outlen);
                } else if (flagoutput == 0 && flagcon == 1) {
                    if (poscon < posin) {
                        problemInput = problemIOC.substring(posin + inlen);
                    } else {
                        problemInput = problemIOC.substring(poscon + conlen, posin);
                    }
                    problemOutput = "";
                } else {
                    problemInput = problemIOC.substring(inlen);
                    problemOutput = "";
                }
            }
            //if input format and output format is not present
            else {
                problemInput = "";
                problemOutput = "";
            }

            //if constraint is present
            if (flagcon == 1) {
                //if constraint is present before input format
                if (poscon < posin) {
                    problemConstraints = problemIOC.substring(0, posin);
                }
                //if constraint is present before output format
                else if (poscon < posoutput) {
                    problemConstraints = problemIOC.substring(poscon + conlen, posoutput);
                } else {
                    problemConstraints = problemIOC.substring(poscon + conlen);
                }
            }

            System.out.println("problemInput :" + problemInput);
            System.out.println("problemOutput :" + problemOutput);
            System.out.println("problemConstraints :" + problemConstraints);

            //retrieve problem tags from problem page
            Element elementtag = doc.getElementsByClass("problem-tags").first().child(1);
            StringTokenizer st = new StringTokenizer(elementtag.text(), ",");
            while (st.hasMoreTokens()) {
                tags.add(st.nextToken().trim());
            }

            //retrieve sample input sample output if present
            Element elementSIO = doc.getElementsByClass("input-output-container").first();
            //if sample input output is present
            if (elementSIO != null) {
                //find position of sample output
                int soutpos = elementSIO.text().indexOf("SAMPLE OUTPUT");
                sampleInput = elementSIO.text().substring(12, soutpos);
                sampleOutput = elementSIO.text().substring(soutpos + 13);
                System.out.println("Sample input :\n" + sampleInput + "\n\n\n");
                System.out.println("Sample Output :\n" + sampleOutput);
            } else {
                sampleInput = "";
                sampleOutput = "";
            }

            //retrieve problem explanation from problem page if present
            Element elementExplanation = doc.getElementsByClass("standard-margin").first().child(0);
            if (elementExplanation.text().toLowerCase().contains("explanation")) {
                problemExplanation = elementExplanation.nextElementSibling().text();
            }
            System.out.println("Explanation :" + problemExplanation);

            //retrieve timelimit
            Element elementTL = doc.getElementsByClass("problem-guidelines").first().child(0).child(1);
            StringTokenizer stTL = new StringTokenizer(elementTL.ownText(), " ");
            problemTimeLimit = Double.parseDouble(stTL.nextToken());

            //System.out.println("problemTimeLimit :"+problemTimeLimit);
            //set all retrieved information to problem class
            problem.setProblemUrl(problemUrl);
            if (problemTitle.length() == 0) {
                problemTitle = null;
            }
            if (problemStatement.length() == 0) {
                problemStatement = null;
            }
            if (problemInput.length() == 0) {
                problemInput = null;
            }
            if (problemOutput.length() == 0) {
                problemOutput = null;
            }
            if (problemExplanation.length() == 0) {
                problemExplanation = null;
            }
            if (problemConstraints.length() == 0) {
                problemConstraints = null;
            }
            problem.setTitle(problemTitle);
            problem.setProblemUrl(problemUrl);
            problem.setProblemStatement(problemStatement);
            problem.setInputFormat(problemInput);
            problem.setOutputFormat(problemOutput);
            problem.setTimeLimit(problemTimeLimit);
            problem.setExplanation(problemExplanation);
            problem.setConstraints(problemConstraints);

            //set sample input output to problem class
            SampleInputOutput sampleInputOutput = new SampleInputOutput(problem, sampleInput, sampleOutput);
            problem.getSampleInputOutputs().add(sampleInputOutput);
            //set platform as hackerearth
            problem.setPlatform(Platform.HackerEarth);
            for (String strtag : tags) {
                problem.getTags().add(strtag);
            }

            //store in database
            Session session = null;
            Transaction transaction = null;
            try {
                //start session
                session = HibernateUtil.getSessionFactory().openSession();
                transaction = session.beginTransaction();

                //check if problem is already stored in database
                String hql = "FROM Problem p where p.problemUrl = :problem_url";
                Problem oldProblem = (Problem) session.createQuery(hql).setString("problem_url", problemUrl)
                        .uniqueResult();
                String task;

                //if problem is present in database
                if (oldProblem != null) {
                    //update the old problem
                    task = "updated";
                    //retrieve id of old problem
                    problem.setId(oldProblem.getId());
                    session.delete(oldProblem);
                    session.flush();
                    session.save(problem);
                } else {
                    task = "saved";
                    session.save(problem);
                }

                transaction.commit();
                //log the info to console
                Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.INFO, "{0} {1}",
                        new Object[] { task, problem.getProblemUrl() });
            } catch (HibernateException ee) {
                if (transaction != null) {
                    transaction.rollback();
                }
                Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.SEVERE,
                        "Cannot Insert/Update problem into databse: " + problemUrl, e);
            } finally {
                //close the session
                if (session != null) {
                    session.close();
                }
            }
        } catch (Exception ee) {
            System.out.println(ee.toString());
        }
    }

    System.out.println("\n\n\n\ntutorial urls\n\n");
    try {

        for (String tutorialurl : tutorialset) {
            //System.out.println(tutorialurl+"\n\n");
            Response tutorialres = Jsoup.connect(tutorialurl).execute();
            Document doc = tutorialres.parse();

            Tutorial tutorial = new Tutorial();
            tutorial.setContent(doc.getElementsByClass("tutorial").first().text());

            tutorial.setName(baseUrl);
            tutorialurl = tutorialurl.substring(0, tutorialurl.length() - 10);
            StringTokenizer tutorialtok = new StringTokenizer(tutorialurl, "/");

            String tempstr = "";
            while (tutorialtok.hasMoreTokens()) {
                tempstr = tutorialtok.nextToken();
            }

            Session session = null;
            Transaction transaction = null;
            try {
                //start session
                session = HibernateUtil.getSessionFactory().openSession();
                transaction = session.beginTransaction();

                //check if problem is already stored in database
                String hql = "FROM Tutorial p where p.name = :name";
                Tutorial oldProblem = (Tutorial) session.createQuery(hql).setString("name", tempstr)
                        .uniqueResult();
                String task;

                //if problem is present in database
                if (oldProblem != null) {
                    //update the old problem
                    task = "updated";
                    //retrieve id of old problem
                    tutorial.setName(oldProblem.getName());
                    session.delete(oldProblem);
                    session.flush();
                    session.save(tutorial);
                } else {
                    task = "saved";
                    tutorial.setName(tempstr);
                    session.save(tutorial);
                }

                transaction.commit();
                //log the info to console
                Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.INFO, "{0} {1}",
                        new Object[] { task, tutorial.getName() });
            } catch (HibernateException ee) {
                if (transaction != null) {
                    transaction.rollback();
                }
                Logger.getLogger(CodeForcesCrawler.class.getName()).log(Level.SEVERE,
                        "Cannot Insert/Update problem into databse: " + tempstr, ee);
            } finally {
                //close the session
                if (session != null) {
                    session.close();
                }
            }

        }
    } catch (Exception e) {
        System.out.println(e.getMessage());
    }
}

From source file:org.shareok.data.sagedata.SageSourceDataHandlerImpl.java

/**
 * Reads the publisher string from the header title container of a full-text page.
 *
 * @param doc the parsed full-text HTML document
 * @return the combined text of the first {@code div#headerTitleContainer} element,
 *         or {@code null} if that element reports a {@code null} text
 * @throws NoHtmlComponentsFoundException if the document contains no
 *         {@code div#headerTitleContainer} element
 */
private String getPublisherFromFullTextDoc(Document doc) throws NoHtmlComponentsFoundException {
    final Elements containers = doc.select("div#headerTitleContainer");
    final boolean containerMissing = (containers == null) || containers.isEmpty();
    if (containerMissing) {
        throw new NoHtmlComponentsFoundException("Cannot find headerTitleContainer");
    }

    // Only the first matching container is consulted; its text is returned as-is.
    final Element firstContainer = containers.get(0);
    return firstContainer.text();
}

From source file:de.geeksfactory.opacclient.apis.Heidi.java

/**
 * Fetches the detail page for one catalogue record and converts it into a
 * {@link DetailledItem}.
 *
 * <p>The page is requested from {@code titel.cgi} using the current session id; when no
 * session id is set yet, {@code start()} is called first. Rows of the {@code .titelsatz}
 * table become the title and the detail entries, rows of the {@code .ex} table become
 * copies, and links inside {@code .status1} / {@code .titelsatz} decide whether the item is
 * reservable and whether a volume search is attached.
 *
 * @param id         the record key, sent as the {@code katkey} request parameter
 * @param homebranch optional branch; when non-empty it is sent as the {@code zweig} cookie
 * @return the populated item
 * @throws IOException if the HTTP request fails
 */
@Override
public DetailledItem getResultById(String id, final String homebranch) throws IOException {

    if (sessid == null) {
        start();
    }

    // Homebranch
    if (homebranch != null && !"".equals(homebranch)) {
        cookieStore.addCookie(new BasicClientCookie("zweig", homebranch));
    }

    String html = httpGet(opac_url + "/titel.cgi?katkey=" + id + "&sess=" + sessid, ENCODING, false,
            cookieStore);
    Document doc = Jsoup.parse(html);

    DetailledItem item = new DetailledItem();
    item.setId(id);

    // Bibliographic data: one <th>/<td> pair per table row.
    Elements table = doc.select(".titelsatz tr");
    for (Element tr : table) {
        if (tr.select("th").size() == 0 || tr.select("td").size() == 0) {
            continue;
        }
        String d = tr.select("th").first().text();
        String c = tr.select("td").first().text();
        if (d.equals("Titel:")) {
            item.setTitle(c);
        } else if ((d.contains("URL") || d.contains("Link")) && tr.select("td a").size() > 0) {
            // For link rows keep the target address rather than the link caption.
            item.addDetail(new Detail(d, tr.select("td a").first().attr("href")));
        } else {
            item.addDetail(new Detail(d, c));
        }
    }

    // Copies.
    if (doc.select(".ex table tr").size() > 0) {
        table = doc.select(".ex table tr");
        DateTimeFormatter fmt = DateTimeFormat.forPattern("dd.MM.yyyy").withLocale(Locale.GERMAN);
        // Fully qualified so that the method does not depend on additional imports.
        java.util.regex.Pattern dueDatePattern = java.util.regex.Pattern
                .compile("entliehen bis ([0-9]+\\.[0-9]+\\.[0-9]+)");
        java.util.regex.Pattern reservationsPattern = java.util.regex.Pattern
                .compile("Vormerkungen: ([0-9]+)");
        for (Element tr : table) {
            // Skip header rows and rows lacking any of the three expected cells.
            if (tr.hasClass("exueber") || tr.select(".exsig").size() == 0 || tr.select(".exso").size() == 0
                    || tr.select(".exstatus").size() == 0) {
                continue;
            }
            Copy copy = new Copy();
            copy.setShelfmark(tr.select(".exsig").first().text());
            copy.setBranch(tr.select(".exso").first().text());
            String status = tr.select(".exstatus").first().text();
            if (status.contains("entliehen bis")) {
                // Extract the date explicitly: String.replaceAll returns the input unchanged
                // when the pattern does not match, which previously made parseLocalDate throw
                // and aborted the whole lookup.
                boolean dateParsed = false;
                java.util.regex.Matcher dueDate = dueDatePattern.matcher(status);
                if (dueDate.find()) {
                    try {
                        copy.setReturnDate(fmt.parseLocalDate(dueDate.group(1)));
                        dateParsed = true;
                    } catch (IllegalArgumentException ignored) {
                        // Not a valid dd.MM.yyyy date; the raw status text is kept below.
                    }
                }
                // Only store a reservation count when one is actually present; otherwise the
                // full status text used to end up in the reservations field.
                java.util.regex.Matcher reservations = reservationsPattern.matcher(status);
                if (reservations.find()) {
                    copy.setReservations(reservations.group(1));
                }
                // Keep the original wording when the date could not be extracted so that no
                // information from the page is lost.
                copy.setStatus(dateParsed ? "entliehen" : status);
            } else {
                copy.setStatus(status);
            }
            item.addCopy(copy);
        }
    }

    // A link to bestellung.cgi marks the item as reservable.
    for (Element a : doc.select(".status1 a")) {
        if (a.attr("href").contains("bestellung.cgi")) {
            item.setReservable(true);
            item.setReservation_info(id);
            break;
        }
    }
    // A link whose caption matches "B.+nde" carries the query for the volume search.
    for (Element a : doc.select(".titelsatz a")) {
        if (a.text().trim().matches("B.+nde")) {
            Map<String, String> volumesearch = new HashMap<>();
            volumesearch.put("query", getQueryParamsFirst(a.attr("href")).get("query"));
            item.setVolumesearch(volumesearch);
        }
    }

    return item;
}

From source file:com.jimplush.goose.ContentExtractor.java

/**
 * Adds any preceding siblings that may have a decent score to this node.
 *
 * <p>Walks backwards over the previous element siblings of {@code node}. A sibling that is
 * itself a {@code <p>} is copied to the front of {@code node}. For any other sibling, each
 * contained {@code <p>} whose stop-word count exceeds 30% of the baseline score is copied
 * into {@code node} as a new paragraph holding that paragraph's text.
 *
 * @param node the element to which qualifying sibling paragraphs are added
 * @return the same {@code node} instance, after the insertions
 */
private Element addSiblings(Element node) {
    if (logger.isDebugEnabled()) {
        logger.debug("Starting to add siblings");
    }
    int baselineScoreForSiblingParagraphs = getBaselineScoreForSiblings(node);

    Element currentSibling = node.previousElementSibling();
    while (currentSibling != null) {
        if (logger.isDebugEnabled()) {
            logger.debug("SIBLINGCHECK: " + debugNode(currentSibling));
        }

        if (currentSibling.tagName().equals("p")) {
            // Siblings are visited nearest-first, so inserting at the front keeps
            // the original document order.
            insertHtmlAt(node, 0, currentSibling.outerHtml());
            currentSibling = currentSibling.previousElementSibling();
            continue;
        }

        // check for a paragraph embedded in a containing element
        int insertedSiblings = 0;
        Elements potentialParagraphs = currentSibling.getElementsByTag("p");
        for (Element paragraph : potentialParagraphs) {
            WordStats wordStats = StopWords.getStopWordCount(paragraph.text());

            int paragraphScore = wordStats.getStopWordCount();

            if ((float) (baselineScoreForSiblingParagraphs * .30) < paragraphScore) {
                if (logger.isDebugEnabled()) {
                    logger.debug("This node looks like a good sibling, adding it");
                }
                // Insert a properly closed, empty paragraph and set its content via text(String)
                // so the plain text is escaped instead of being re-parsed as markup. The former
                // "<p>…<p>" snippet also left a stray empty paragraph behind.
                insertHtmlAt(node, insertedSiblings, "<p></p>").text(paragraph.text());
                insertedSiblings++;
            }

        }

        currentSibling = currentSibling.previousElementSibling();
    }
    return node;

}

/**
 * Inserts an HTML fragment so that it becomes the child element at {@code index} of
 * {@code parent}; when {@code parent} has no child element at that index the fragment is
 * appended instead (this avoids the IndexOutOfBoundsException of {@code child(index)} on a
 * node without element children).
 *
 * @param parent the element receiving the fragment
 * @param index  desired position among the parent's child elements
 * @param html   markup to insert; expected to produce at least one element
 * @return the child element now located at the insertion position
 */
private Element insertHtmlAt(Element parent, int index, String html) {
    int childCount = parent.children().size();
    if (index < childCount) {
        parent.child(index).before(html);
        return parent.child(index);
    }
    parent.append(html);
    return parent.child(childCount);
}