List of usage examples for org.jsoup.nodes.Element#html()
public String html()
From source file:com.github.hronom.scrape.dat.website.controllers.ScrapeButtonController.java
public void processByHtmlUnit() { // Disable fields in view. scrapeView.setWebsiteUrlTextFieldEnabled(false); scrapeView.setSelectorTextFieldEnabled(false); scrapeView.setScrapeButtonEnabled(false); scrapeView.setWorkInProgress(true);/*from ww w . jav a2 s.c o m*/ scrapeView.setOutput(""); scrapeView.setProgressBarTaskText("initializing"); logger.info("Start processing..."); long beginTime = System.currentTimeMillis(); // Output input parameters. if (!scrapeView.getWebsiteUrl().isEmpty() && !scrapeView.getSelector().isEmpty()) { logger.info("Input parameters: \"" + scrapeView.getWebsiteUrl() + "\", \"" + scrapeView.getSelector() + "\", \""); } // Process. try { URL url = new URL(scrapeView.getWebsiteUrl()); scrapeView.setProgressBarTaskText("requesting page"); logger.info("Requesting page..."); HtmlPage page = webClient.getPage(url); logger.info("Requesting of page completed."); scrapeView.setProgressBarTaskText("viewing page as XML"); logger.info("View page as XML"); String xml = page.asXml(); // Unescape html. 
scrapeView.setProgressBarTaskText("unescaping HTML"); logger.info("Unescape html"); xml = StringEscapeUtils.unescapeHtml4(xml); logger.info("Get selector"); String selector = scrapeView.getSelector(); if (!xml.isEmpty() && !selector.isEmpty()) { scrapeView.setProgressBarTaskText("parsing HTML"); logger.info("Parse HTML"); Document doc = Jsoup.parse(xml); scrapeView.setProgressBarTaskText("selecting elements in HTML"); logger.info("select elements in HTML"); Elements selectedElements = doc.select(selector); if (!selectedElements.isEmpty()) { scrapeView.setProgressBarTaskText("parsing selected elements"); logger.info("Parse extracted elements"); StringBuilder sb = new StringBuilder(); for (Element element : selectedElements) { String body = element.html(); sb.append(body); sb.append("\n"); sb.append("\n"); } scrapeView.setOutput(sb.toString()); } } } catch (Exception e) { logger.error(e); } webClient.close(); long endTime = System.currentTimeMillis(); logger.info("Process time: " + (endTime - beginTime) + " ms."); logger.info("Processing complete."); // Enable fields in view. scrapeView.setWorkInProgress(false); scrapeView.setScrapeButtonEnabled(true); scrapeView.setSelectorTextFieldEnabled(true); scrapeView.setWebsiteUrlTextFieldEnabled(true); }
From source file:net.pixomania.crawler.W3C.parser.rules.editors.EditorsRule6.java
@Override public ArrayList<Person> run(String url, Document doc) { ArrayList<Person> editorList = new ArrayList<>(); Element editor = null; try {// w ww. j av a 2s. c om editor = doc.select(".authlist").get(0).select("p").get(0); } catch (IndexOutOfBoundsException e) { try { editor = doc.select("h4:contains(Editors) ~ p").get(0); } catch (IndexOutOfBoundsException e1) { return null; } } String[] splitted = editor.html().split("<br />"); if (splitted.length < 2) splitted = editor.html().split("<br clear=\"none\" />"); for (String split : splitted) { if (!split.isEmpty()) { if (split.equals("WHATWG:") || split.equals("W3C:")) continue; Document newdoc = Jsoup.parse(split.replaceAll("\n", "")); Person result = NameParser.parse(newdoc.text()); if (result == null) continue; for (int i = 0; i < newdoc.select("a").size(); i++) { if (!newdoc.select("a").get(i).attr("href").isEmpty()) { if (newdoc.select("a").get(i).attr("href").contains("@")) { result.setEmail(newdoc.select("a").get(i).attr("href").replace("mailto:", "")); } else { result.addWebsite(newdoc.select("a").get(i).attr("href")); } } } editorList.add(result); } } if (editorList.size() == 0) return null; return editorList; }
From source file:com.kantenkugel.discordbot.jdocparser.JDocParser.java
static void parse(final String jdocBase, final String name, final InputStream inputStream, Map<String, ClassDocumentation> docs) { final String[] pathSplits = name.split("/"); final String fileName = pathSplits[pathSplits.length - 1]; if (!Character.isUpperCase(fileName.charAt(0))) { //ignore jdoc structure html return;/* w w w. j av a2 s . com*/ } final String[] nameSplits = fileName.split("\\."); final String className = nameSplits[nameSplits.length - 2]; final String fullName = fileName.substring(0, fileName.length() - nameSplits[nameSplits.length - 1].length() - 1); try (BufferedReader buffer = new BufferedReader(new InputStreamReader(inputStream))) { //create dom Document final String content = buffer.lines().collect(Collectors.joining("\n")); Document document = Jsoup.parse(content); //classDocument (classname, package, description) Element titleElem = getSingleElementByClass(document, "title"); final String classSig = JDocUtil.fixSpaces(titleElem.text()); Element packageElem = titleElem.previousElementSibling(); if (packageElem.children().size() > 1) { packageElem = packageElem.children().last(); } final String pack = JDocUtil.fixSpaces(packageElem.text()); final String link = JDocUtil.getLink(jdocBase, pack, fullName); Element descriptionElement = null; Elements descriptionCandidates = document.select(".description .block"); if (descriptionCandidates.size() > 1) { List<Element> removed = descriptionCandidates.stream().map(elem -> elem.child(0)) .filter(child -> child != null && !child.className().startsWith("deprecat")) .map(Element::parent).collect(Collectors.toList()); if (removed.size() != 1) throw new RuntimeException("Found too many description candidates"); descriptionElement = removed.get(0); } else if (descriptionCandidates.size() == 1) { descriptionElement = descriptionCandidates.get(0); } final String description = descriptionElement == null ? 
"" : JDocUtil.formatText(descriptionElement.html(), link); final ClassDocumentation classDoc = new ClassDocumentation(pack, fullName, classSig, description, classSig.startsWith("Enum")); //methods, fields final Element details = document.getElementsByClass("details").first(); if (details != null) { //methods Element tmp = getSingleElementByQuery(details, "a[name=\"method.detail\"]"); List<DocBlock> docBlock = getDocBlock(jdocBase, tmp, classDoc); if (docBlock != null) { for (DocBlock block : docBlock) { Set<MethodDocumentation> mdocs = classDoc.methodDocs .computeIfAbsent(block.title.toLowerCase(), key -> new HashSet<>()); mdocs.add(new MethodDocumentation(classDoc, block.signature, block.hashLink, block.description, block.fields)); } } //vars tmp = getSingleElementByQuery(details, "a[name=\"field.detail\"]"); docBlock = getDocBlock(jdocBase, tmp, classDoc); if (docBlock != null) { for (DocBlock block : docBlock) { classDoc.classValues.put(block.title.toLowerCase(), new ValueDocumentation(classDoc, block.title, block.hashLink, block.signature, block.description)); } } //enum-values tmp = getSingleElementByQuery(details, "a[name=\"enum.constant.detail\"]"); docBlock = getDocBlock(jdocBase, tmp, classDoc); if (docBlock != null) { for (DocBlock block : docBlock) { classDoc.classValues.put(block.title.toLowerCase(), new ValueDocumentation(classDoc, block.title, block.hashLink, block.signature, block.description)); } } } final Element methodSummary = getSingleElementByQuery(document, "a[name=\"method.summary\"]"); classDoc.inheritedMethods.putAll(getInheritedMethods(methodSummary)); //storing if (nameSplits.length > 2) { if (!docs.containsKey(nameSplits[0].toLowerCase())) docs.put(nameSplits[0].toLowerCase(), new ClassDocumentation(null, null, null, null, false)); ClassDocumentation parent = docs.get(nameSplits[0].toLowerCase()); for (int i = 1; i < nameSplits.length - 2; i++) { if (!parent.subClasses.containsKey(nameSplits[i].toLowerCase())) 
parent.subClasses.put(nameSplits[i].toLowerCase(), new ClassDocumentation(null, null, null, null, false)); parent = parent.subClasses.get(nameSplits[i].toLowerCase()); } if (parent.subClasses.containsKey(className.toLowerCase())) classDoc.subClasses.putAll(parent.subClasses.get(className.toLowerCase()).subClasses); parent.subClasses.put(className.toLowerCase(), classDoc); } if (docs.containsKey(fullName.toLowerCase())) { ClassDocumentation current = docs.get(fullName.toLowerCase()); if (current.classSig != null) throw new RuntimeException("Got a class-name conflict with classes " + classDoc.classSig + "(" + classDoc.className + ") AND " + current.classSig + "(" + current.className + ")"); classDoc.subClasses.putAll(current.subClasses); } docs.put(fullName.toLowerCase(), classDoc); } catch (final IOException | NullPointerException ex) { JDocUtil.LOG.error("Got excaption for element {}", fullName, ex); } try { inputStream.close(); } catch (final IOException e) { JDocUtil.LOG.error("Error closing inputstream", e); } }
From source file:accountgen.controller.Controller.java
/**
 * Reads the address from the first {@code .adr} element found inside the elements with class
 * {@code address} and stores it on the person.
 *
 * <p>The inner HTML of that element is split at {@code <br />}. The last space-separated token
 * of the first part is the street number and the text before it is the street name; the last
 * token of the second part is the state and the first token of the second part is the postcode.
 * The country is always {@code Consts.COUNTRY}.
 *
 * @param doc the parsed page
 * @param p   the person that receives the address
 * @throws NullPointerException           when the page has no such {@code .adr} element
 * @throws ArrayIndexOutOfBoundsException when the element's HTML contains no {@code <br />}
 */
private void setAddress(Document doc, Person p) {
    Element adr = doc.getElementsByClass("address").select(".adr").first();
    java.util.Objects.requireNonNull(adr, "no .adr element found inside an .address element");

    // Serialise and split once instead of repeating the work for every field.
    final String html = adr.html();
    final String[] parts = html.split("<br />");
    final String streetPart = parts[0];
    final String localityPart = parts[1];

    final String streetnumber = StringEscapeUtils.unescapeHtml4(lastToken(streetPart)).trim();
    final String state = StringEscapeUtils.unescapeHtml4(lastToken(localityPart)).trim();

    Address address = new Address();
    address.setStreetnumber(streetnumber);
    // The scraped values are matched literally; they are data, not regular expressions.
    address.setStreetname(StringEscapeUtils.unescapeHtml4(textBefore(html, streetnumber)).trim());
    address.setState(state);
    address.setPostcode(
            StringEscapeUtils.unescapeHtml4(textBefore(localityPart, state)).trim().split(" ")[0]);
    address.setCountry(Consts.COUNTRY);
    p.setAdress(address);
}

/** Returns the last space-separated token of {@code text}. */
private static String lastToken(String text) {
    String[] tokens = text.split(" ");
    return tokens[tokens.length - 1];
}

/** Returns the part of {@code text} before the first literal occurrence of {@code marker}, or all of it. */
private static String textBefore(String text, String marker) {
    int at = text.indexOf(marker);
    return at < 0 ? text : text.substring(0, at);
}
From source file:mx.itdurango.rober.siitdocentes.asynctasks.GruposTask.java
/** * Procesa el html resultante de la peticin del listado de grupos descomponiendolo y asignandolo a un ArrayList * * @param html cuerpo html del resultado de la peticin *//* w ww . j a v a2 s .c o m*/ public void procesa(String html) { //se genera un documento donde se almacena el contenido html listo para ser procesado. Document doc = Jsoup.parse(html); //se obtiene la tabla donde se encuentra el contenido que interesa Element tabla = doc.getElementsByTag("table").get(0); //se obtienen todos los renglones de la tabla Elements renglones = tabla.getElementsByTag("tr"); //arraylist que almacenar la informacin de los grupos ArrayList<Grupos> gcs = new ArrayList<Grupos>(); //se recorre cada renglon almacenandolo en un objeto for (Element tr : renglones) { //se obtienen todos los elementos td de cada renglon. Elements tds = tr.getElementsByTag("td"); //lleva el control de la columna que se est evaluando int l = 1; //objeto para lmacenar la informacion de cada grupo Grupos gc = new Grupos(); //se recorren todos los elementos td del renglon actual for (Element td : tds) { //en el renglon 1 se encuentra la informacion del grupo con el siguiente formato //<b> CLAVE_MATERIA </b> <br> NOMBRE DE LA MATERIA if (l == 1) { //se obtiene el contenido de la celda String datos = td.html(); //eliminar las etiquetas de inicio de negritas datos = datos.replaceAll("<b>", ""); //separar la cadena para tener en la posicin 0 la clave de la materia y en la posicion 1 el nombre de la misma. 
String m[] = datos.split("</b> <br />"); gc.setClave(m[0]); //se asigna la clave de la materia al campo correspondiente gc.setNombre(m[1]);//se asigna el nombre de la materia al campo correspondiente } else if (l == 2) { //en la columna 2 se encuentra el grupo gc.setGrupo(td.html()); } else if (l == 3) { //en la columna 3 se encuentra el numero de alumnos inscritos gc.setAlumnos(td.html()); } else if (l == 4) { //en la columna 4 se encuentran los vinculos para asignar calificaciones parciales con el siguiente formato // <img src="http://siit.itdurango.edu.mx/img/iconos/captura_calif.gif" // onclick="window.location = "calificaciones_parciales.php?periodo=20141&materia=IT8851&grupo=8TA"" // alt="Captura de Informacin" style="cursor:pointer"> //tomamos el contenido de la celda String params = td.html(); //si separamos mediante la cadena """ podemos obtener solamente la url con parmetros que se tiene que mandar llamar String separado[] = params.split("""); params = separado[1]; // solo los parametros params = params.replaceAll("&", "&"); //asignar la url a su campo correspondiente gc.setUrl(params); } //incrementar el numero de columna l++; } //si la clave es nula significa que no es una materia, probablemente sea el encabezado de la tabla if (gc.getClave() != null) gcs.add(gc); } //se genera un adapter nuevo con la informacin obtenida para ser asignado al listview de grupos. context.lvGrupos.setAdapter(new GruposAdapter(context, R.layout.item_grupos, gcs)); }
From source file:me.vertretungsplan.parser.SVPlanParser.java
/**
 * Parses one day of an svPlan substitution schedule and adds it to the schedule.
 *
 * <p>The element is accepted as a plan when it contains a date/title element
 * ({@code .svp-plandatum-heute}, {@code .svp-plandatum-morgen} or {@code .Titel}) or when the
 * document title starts with "Vertretungsplan für ". Table cells are mapped to substitution
 * fields by the prefix of their class name; an empty lesson or class cell repeats the value of
 * the previous row. Substitutions without a recognised type get the type "Vertretung".
 * Afterwards the blocked-teacher, absence and announcement sections are added as messages.
 *
 * @param v   the schedule that receives the parsed day
 * @param svp the element containing the plan of one day
 * @param doc the document the element belongs to
 * @throws IOException when the element does not look like an svPlan day
 */
private void parseSvPlanDay(SubstitutionSchedule v, Element svp, Document doc) throws IOException {
    SubstitutionScheduleDay day = new SubstitutionScheduleDay();

    // The umlaut is written as an escape so the literal survives any source encoding.
    boolean looksLikePlan = svp.select(".svp-plandatum-heute, .svp-plandatum-morgen, .Titel").size() > 0
            || doc.title().startsWith("Vertretungsplan f\u00fcr ");
    if (!looksLikePlan) {
        throw new IOException("keine SVPlan-Tabelle gefunden");
    }

    setDate(svp, doc, day);

    if (svp.select(".svp-tabelle, table:has(.Klasse)").size() > 0) {
        // Values that do not change from row to row are looked up once.
        final boolean docUsesHeaderClass = doc.select(".svp-header").size() > 0;
        final String classSeparator = data.optString(PARAM_CLASS_SEPARATOR, ", ");
        final boolean excludeTeachers = data.optBoolean(PARAM_EXCLUDE_TEACHERS);

        String lastLesson = "";
        String lastClass = "";
        for (Element row : svp.select(".svp-tabelle tr, table:has(.Klasse) tr")) {
            boolean isHeaderOrEmpty = (docUsesHeaderClass && row.hasClass("svp-header"))
                    || row.select("th").size() > 0
                    || row.text().trim().equals("");
            if (isHeaderOrEmpty) {
                continue;
            }

            Substitution substitution = new Substitution();
            for (Element column : row.select("td")) {
                final String type = column.className();
                final String text = column.text();
                final boolean isLesson = type.startsWith("svp-stunde") || type.startsWith("Stunde");
                final boolean isClass = type.startsWith("svp-klasse") || type.startsWith("Klasse");

                if (!hasData(text)) {
                    // Empty lesson/class cells inherit the value of the previous row.
                    if (isLesson && hasData(lastLesson)) {
                        substitution.setLesson(lastLesson);
                    } else if (isClass && hasData(lastClass)) {
                        substitution.getClasses().addAll(Arrays.asList(lastClass.split(classSeparator)));
                    }
                    continue;
                }

                if (isLesson) {
                    substitution.setLesson(text);
                    lastLesson = text;
                } else if (isClass) {
                    substitution.getClasses().addAll(Arrays.asList(text.split(classSeparator)));
                    lastClass = text;
                } else if (type.startsWith("svp-esfehlt") || type.startsWith("Lehrer")) {
                    if (!excludeTeachers) {
                        substitution.setPreviousTeacher(text);
                    }
                } else if (type.startsWith("svp-esvertritt") || type.startsWith("Vertretung")) {
                    if (!excludeTeachers) {
                        // A trailing " +" is not part of the teacher's name.
                        substitution.setTeacher(text.replaceAll(" \\+$", ""));
                    }
                } else if (type.startsWith("svp-fach") || type.startsWith("Fach")) {
                    substitution.setSubject(text);
                } else if (type.startsWith("svp-bemerkung") || type.startsWith("Anmerkung")) {
                    substitution.setDesc(text);
                    String recognizedType = recognizeType(text);
                    substitution.setType(recognizedType);
                    substitution.setColor(colorProvider.getColor(recognizedType));
                } else if (type.startsWith("svp-raum") || type.startsWith("Raum")) {
                    substitution.setRoom(text);
                }
            }

            if (substitution.getType() == null) {
                substitution.setType("Vertretung");
                substitution.setColor(colorProvider.getColor("Vertretung"));
            }
            day.addSubstitution(substitution);
        }
    }

    Elements blockedTeachers = svp.select(".LehrerVerplant");
    if (blockedTeachers.size() > 0) {
        day.addMessage("<b>Verplante Lehrer:</b> " + blockedTeachers.text());
    }
    Elements absences = svp.select(".Abwesenheiten");
    if (absences.size() > 0) {
        day.addMessage("<b>Abwesenheiten:</b> " + absences.text());
    }

    Elements announcementHeadings = svp.select("h2:contains(Mitteilungen)");
    if (announcementHeadings.size() > 0) {
        // Announcements are the run of paragraphs directly following the heading.
        Element sibling = announcementHeadings.first().nextElementSibling();
        while (sibling != null && sibling.tagName().equals("p")) {
            addMessagesFromHtml(day, sibling.html());
            sibling = sibling.nextElementSibling();
        }
    } else {
        Elements announcementBlocks = svp.select(".Mitteilungen");
        if (announcementBlocks.size() > 0) {
            for (Element block : announcementBlocks) {
                addMessagesFromHtml(day, block.html());
            }
        }
    }

    v.addDay(day);
}

/** Splits announcement HTML at double line breaks and adds every part with data as a message. */
private void addMessagesFromHtml(SubstitutionScheduleDay day, String html) {
    String decoded = TextNode.createFromEncoded(html, null).getWholeText();
    for (String nachricht : decoded.split("<br />\\s*<br />")) {
        if (hasData(nachricht)) {
            day.addMessage(nachricht);
        }
    }
}
From source file:eu.masconsult.bgbanking.banks.sgexpress.SGExpressClient.java
private RawBankAccount obtainBankAccountFromHtmlTableRow(String type, Element row) { if ("detail".equalsIgnoreCase(row.attr("class"))) { // detail row return null; }/*from w w w . j a va 2 s .com*/ if ("bg0".equalsIgnoreCase(row.attr("class"))) { Log.v(TAG, "working row(" + type + "): " + row.html()); if ("Current Accounts".equalsIgnoreCase(type)) { return new RawBankAccount().setServerId(row.child(2).text()).setName(row.child(0).child(0).text()) .setIBAN(row.child(2).text()).setCurrency(row.child(1).text()) .setBalance(Convert.strToFloat(row.child(3).text())) .setAvailableBalance(Convert.strToFloat(row.child(4).text())); } else if ("Cards".equalsIgnoreCase(type)) { // skip cards for now return null; } else { // unknown type return null; } } else { return null; } }
From source file:org.abondar.experimental.eventsearch.EventFinder.java
public void getEvent(String eventId, String evType) { try {// www . ja va 2s.c o m Document dc = Jsoup.connect("https://afisha.yandex.ru/msk/events/" + eventId + "/").get(); Event eb = new Event(); eb.setEventID(eventId); eb.setCategory(eventTypes.get(evType)); Elements elems = dc.select("meta"); for (Element e : elems) { if (e.attributes().get("property").contains("og:description")) { eb.setDescription(e.attributes().get("content")); } } elems = dc.select("title"); for (Element e : elems) { eb.setName(e.html().substring(0, e.html().indexOf(""))); } elems = dc.select("a[href]"); for (Element e : elems) { for (Attribute attr : e.attributes().asList()) { if (attr.getValue().contains("/msk/places/")) { eb.setPlace(getEventPlaces(attr.getValue())); } } } elems = dc.select("tr[id]"); for (Element e : elems) { for (Attribute attr : e.attributes().asList()) { if (attr.getValue().contains("f")) { eb.setDate(e.children().first().html()); try { Element e1 = e.child(1).children().first(); Element e2 = e1.children().first(); Element e3 = e2.children().first(); Element e4 = e3.children().first(); eb.setTime(e4.html()); } catch (NullPointerException ex) { Element e1 = e.child(2).children().first(); Element e2 = e1.children().first(); Element e3 = e2.children().first(); Element e4 = e3.children().first(); eb.setTime(e4.html()); } } } } geoCode(eb); formJson(eb); } catch (IOException ex) { Logger.getLogger(EventFinder.class.getName()).log(Level.SEVERE, null, ex); } }
From source file:eu.sisob.uma.NPL.Researchers.GateResearcherAnnCollector.java
@SuppressWarnings("unchecked") private void writeResultsInHTMLFile(Document doc, File file_result) { String startTagPart_1 = "<br><span GateID=\""; String startTagPart_2 = "\" title=\""; String startTagPart_3 = "\" style=\"background:LightBlue;\">"; String endTag = "</span><br>"; AnnotationSet defaultAnnotSet = doc.getAnnotations(); Set annotTypesRequired = new HashSet(); annotTypesRequired.add("ProfessionalActivityCurrent"); annotTypesRequired.add("ProfessionalActivityNoCurrent"); annotTypesRequired.add("AccreditedUniversityStudiesOtherPostGrade"); annotTypesRequired.add("AccreditedUniversityStudiesDegree"); annotTypesRequired.add("AccreditedUniversityStudiesPhDStudies"); annotTypesRequired.add("AgentIdentification"); Set<Annotation> peopleAndPlaces = new HashSet<Annotation>(defaultAnnotSet.get(annotTypesRequired)); FeatureMap features = doc.getFeatures(); String originalContent = doc.getContent().toString(); //(String) features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME); //RepositioningInfo info = (RepositioningInfo) // features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME); String xmlDocument = doc.toXml(peopleAndPlaces, true); String css_code = "<style type=\"text/css\">" + "span.AgentIdentification" + "{" + " background-color: #808080;" + "} " + "span.AccreditedUniversityStudiesPhDStudies" + "{" + " background-color: #FFFFCC;" + "} " + "span.AccreditedUniversityStudiesDegree" + "{" + " background-color: #CCFFCC;" + "} " + "span.AccreditedUniversityStudiesOtherPostGrade" + "{" + " background-color: #C17128;" + "} " + "span.ProfessionalActivityNoCurrent" + "{" + " background-color: #99CCCC;" + "} " + "span.ProfessionalActivityCurrent" + "{" + " background-color: #FF99CC;" + "} " + ".fixed {position:fixed !important; right:0px; top:0px; z-index:10 !important; background-color: #ffffff;} " + "</style>"; String legend = "<div class=\"fixed\">NOTES:<br>"; legend = legend/*from www. j a v a2 s . 
co m*/ + "|1| = <span class=\"AccreditedUniversityStudiesPhDStudies\">AccreditedUniversityStudiesPhDStudies"; legend = legend + "</span><br>"; legend = legend + "|2| = <span class=\"AccreditedUniversityStudiesDegree\">AccreditedUniversityStudiesDegree"; legend = legend + "</span><br>"; legend = legend + "|3| = <span class=\"AccreditedUniversityStudiesOtherPostGrade\">AccreditedUniversityStudiesOtherPostGrade"; legend = legend + "</span><br>"; legend = legend + "|4| = <span class=\"ProfessionalActivityNoCurrent\">ProfessionalActivityNoCurrent"; legend = legend + "</span><br>"; legend = legend + "|5| = <span class=\"ProfessionalActivityCurrent\">ProfessionalActivityCurrent"; legend = legend + "</span><br>"; legend = legend + "|6| = <span class=\"AgentIdentification\">AgentIdentification"; legend = legend + "</span></div><br><br><br><br><br>"; int index1 = xmlDocument.indexOf("</head>"); if (index1 > 0) { xmlDocument = xmlDocument.replace("</head>", "</head>" + css_code + legend); } else { xmlDocument = css_code + legend + xmlDocument; } { org.jsoup.nodes.Document docjsoup = org.jsoup.Jsoup.parse(xmlDocument); org.jsoup.select.Elements elements = docjsoup.select("AccreditedUniversityStudiesDegree"); if (elements != null) { for (org.jsoup.nodes.Element element : elements) { String s = element.html(); s = s; } } } // xmlDocument = xmlDocument.replace("<AccreditedUniversityStudiesPhDStudies","<b>#SP#</b><span class=\"AccreditedUniversityStudiesPhDStudies\""); // xmlDocument = xmlDocument.replace("</AccreditedUniversityStudiesPhDStudies>","</span><b>#SP#</b>"); // // xmlDocument = xmlDocument.replace("<AccreditedUniversityStudiesDegree","<b>#SD#</b><span class=\"AccreditedUniversityStudiesDegree\""); // xmlDocument = xmlDocument.replace("</AccreditedUniversityStudiesDegree>","</span><b>#SD#</b>"); // // xmlDocument = xmlDocument.replace("<AccreditedUniversityStudiesOtherPostGrade","<b>#SO#</b><span class=\"AccreditedUniversityStudiesPhDStudies\""); // xmlDocument = 
xmlDocument.replace("</AccreditedUniversityStudiesOtherPostGrade>","</span><b>#SO#</b>"); // // xmlDocument = xmlDocument.replace("<ProfessionalActivityNoCurrent","<b>#</b><span class=\"ProfessionalActivityNoCurrent\""); // xmlDocument = xmlDocument.replace("</ProfessionalActivityNoCurrent>","</span><b>#PN#</b>"); // // xmlDocument = xmlDocument.replace("<ProfessionalActivityCurrent","<b>#</b><span class=\"ProfessionalActivityCurrent\""); // xmlDocument = xmlDocument.replace("</ProfessionalActivityCurrent>","</span><b>#PC#</b>"); // xmlDocument = xmlDocument.replace("<AccreditedUniversityStudiesPhDStudies","<span class=\"AccreditedUniversityStudiesPhDStudies\""); // xmlDocument = xmlDocument.replace("</AccreditedUniversityStudiesPhDStudies>","</span>"); // // xmlDocument = xmlDocument.replace("<AccreditedUniversityStudiesDegree","<span class=\"AccreditedUniversityStudiesDegree\""); // xmlDocument = xmlDocument.replace("</AccreditedUniversityStudiesDegree>","</span>"); // // xmlDocument = xmlDocument.replace("<AccreditedUniversityStudiesOtherPostGrade","<span class=\"AccreditedUniversityStudiesPhDStudies\""); // xmlDocument = xmlDocument.replace("</AccreditedUniversityStudiesOtherPostGrade>","</span>"); // // xmlDocument = xmlDocument.replace("<ProfessionalActivityNoCurrent","<span class=\"ProfessionalActivityNoCurrent\""); // xmlDocument = xmlDocument.replace("</ProfessionalActivityNoCurrent>","</span>"); // // xmlDocument = xmlDocument.replace("<ProfessionalActivityCurrent","<span class=\"ProfessionalActivityCurrent\""); // xmlDocument = xmlDocument.replace("</ProfessionalActivityCurrent>","</span>"); xmlDocument = xmlDocument.replace("<AccreditedUniversityStudiesPhDStudies", "<b>|1|</b><span class=\"AccreditedUniversityStudiesPhDStudies\""); xmlDocument = xmlDocument.replace("</AccreditedUniversityStudiesPhDStudies>", "</span><b>|1|</b>"); xmlDocument = xmlDocument.replace("<AccreditedUniversityStudiesDegree", "<b>|2|</b><span 
class=\"AccreditedUniversityStudiesDegree\""); xmlDocument = xmlDocument.replace("</AccreditedUniversityStudiesDegree>", "</span><b>|2|</b>"); xmlDocument = xmlDocument.replace("<AccreditedUniversityStudiesOtherPostGrade", "<b>|3|</b><span class=\"AccreditedUniversityStudiesPhDStudies\""); xmlDocument = xmlDocument.replace("</AccreditedUniversityStudiesOtherPostGrade>", "</span><b>|3|</b>"); xmlDocument = xmlDocument.replace("<ProfessionalActivityNoCurrent", "<b>|4|</b><span class=\"ProfessionalActivityNoCurrent\""); xmlDocument = xmlDocument.replace("</ProfessionalActivityNoCurrent>", "</span><b>|4|</b>"); xmlDocument = xmlDocument.replace("<ProfessionalActivityCurrent", "<b>|5|</b><span class=\"ProfessionalActivityCurrent\""); xmlDocument = xmlDocument.replace("</ProfessionalActivityCurrent>", "</span><b>|5|</b>"); xmlDocument = xmlDocument.replace("<AgentIdentification", "<b>|6|</b><span class=\"AgentIdentification\""); xmlDocument = xmlDocument.replace("</AgentIdentification>", "</span><b>|6|</b>"); xmlDocument = xmlDocument.replace("\n", "<br>"); try { FileUtils.write(file_result, xmlDocument, "UTF-8"); } catch (IOException ex) { ProjectLogger.LOGGER.error("The verbose file can not be created " + file_result.getPath(), ex); } }
From source file:org.abondar.experimental.eventsearch.EventFinder.java
public String getEventPlaces(String place) { String res = ""; try {//from w w w . j a va 2 s. co m Document placeDoc = Jsoup.connect("https://afisha.yandex.ru" + place).get(); Elements elems = placeDoc.select("p"); for (Element e : elems) { if (e.parents().get(1).html().contains("<div style")) { if (e.children().size() > 1) { if (e.child(1).hasAttr("href")) { res = e.child(1).html() + " ?"; } } else if (e.children().isEmpty()) { res = e.html() + " ?"; } } } } catch (IOException ex) { Logger.getLogger(EventFinder.class.getName()).log(Level.SEVERE, null, ex); } return res; }