List of usage examples for the org.jsoup.nodes.Element method attr(String)
public String attr(String attributeKey)
From source file:no.kantega.publishing.admin.content.htmlfilter.ImgHeightAndWidthFilter.java
@Override public Document runFilter(Document document) { if (multimediaDao == null) { ApplicationContext context = RootContext.getInstance(); multimediaDao = context.getBean(MultimediaDao.class); imageEditor = context.getBean(ImageEditor.class); }// w ww . j ava 2s. c o m for (Element img : document.getElementsByTag("img")) { String width = img.attr("width"); String height = img.attr("height"); if (isNotBlank(width) && isNoneBlank(height)) { try { int imageWidth = Integer.parseInt(width); int imageHeight = Integer.parseInt(height); String url = img.attr("src"); if (url != null) { List<Integer> ids = MultimediaHelper.getMultimediaIdsFromText(url); if (ids.size() == 1) { int multimediaId = ids.get(0); Multimedia image = multimediaDao.getMultimedia(multimediaId); if (imageWidth != image.getWidth() || imageHeight != image.getHeight()) { MultimediaDimensions d = imageEditor.getResizedImageDimensions(image.getWidth(), image.getHeight(), imageWidth, imageHeight); img.attr("height", String.valueOf(d.getHeight())); img.attr("width", String.valueOf(d.getWidth())); String imageUrl = image.getUrl(); img.attr("src", imageUrl + (imageUrl.contains("?") ? "&" : "?") + "width=" + d.getWidth()); } } } } catch (NumberFormatException e) { log.error("Could not parse number", e); } } } return document; }
From source file:no.kantega.publishing.admin.content.htmlfilter.ReplaceStyleAlignWithAttributeAlignFilter.java
@Override public Document runFilter(Document document) { for (String tag : tags) { for (Element element : document.getElementsByTag(tag)) { String style = element.attr("style"); if (isNotBlank(style)) { if (style.contains("right")) { element.attr("align", "right"); } else if (style.contains("left")) { element.attr("align", "left"); } else if (style.contains("center")) { element.attr("align", "center"); }//from ww w . ja v a 2 s .c o m element.removeAttr("style"); } } } return document; }
From source file:no.kantega.publishing.modules.linkcheck.crawl.LinkExtractor.java
private void handleAttribute(Content content, LinkHandler linkHandler, Attribute attribute) { String attrName = (isNotBlank(attribute.getTitle())) ? attribute.getTitle() : attribute.getName(); if (attribute instanceof HtmltextAttribute) { String html = attribute.getValue(); try {//from ww w . j ava2 s . c o m if (html != null) { Elements links = Jsoup.parse(html).select("a[href]"); for (Element link : links) { String href = link.attr("href"); linkHandler.attributeLinkFound(content, href, attrName); } } } catch (Throwable e) { eventLog.log("LinkExtractor", "localhost", Event.FAILED_LINK_EXTRACT, String.format("Failed to extract links from %s", content.getUrl()), content); log.error("contentId: {}, associationid: {}, attribute: {} {}", content.getId(), content.getAssociation().getId(), attrName, html); } } else if (attribute instanceof UrlAttribute) { String link = attribute.getValue(); if (link != null && link.length() > 0) { if (link.startsWith("/")) { link = Aksess.VAR_WEB + link; } linkHandler.attributeLinkFound(content, link, attrName); } } else if (attribute instanceof FileAttribute && isNotBlank(attribute.getValue())) { try { int attachmentId = Integer.parseInt(attribute.getValue()); String link = Aksess.VAR_WEB + "/attachment.ap?id=" + attachmentId; linkHandler.attributeLinkFound(content, link, attrName); } catch (Exception e) { log.error("Error getting Content({}) FileAttribute {} with value {}", content.getId(), attribute.getName(), attribute.getValue()); } } else if (attribute instanceof MediaAttribute && isNotBlank(attribute.getValue())) { try { int mediaId = Integer.parseInt(attribute.getValue()); String link = Aksess.VAR_WEB + "/multimedia.ap?id=" + mediaId; linkHandler.attributeLinkFound(content, link, attrName); } catch (Exception e) { log.error("Error getting Content({}) FileAttribute {} with value {}", content.getId(), attribute.getName(), attribute.getValue()); } } else if (attribute instanceof RepeaterAttribute) { RepeaterAttribute 
repeaterAttribute = (RepeaterAttribute) attribute; for (List<Attribute> attributes : repeaterAttribute) { for (Attribute a : attributes) { handleAttribute(content, linkHandler, a); } } } }
From source file:noThreads.Menu.java
public void createMenu() throws IOException, InterruptedException { Document doc = null;//from w ww. j a v a 2 s . c o m BufferedReader br = null; System.out.print("******************** Menu Options ******************** " + "\n1. Get a playlist for all the stations at <e-radio.gr>" + "\n2. View the available station Categories and get a playlist." + "\n3. View the available station Locations and get a playlist." + "\n4. View the station Ratings (Top) and get a playlist." + "\n5. Exit." + "\n\n" + "Please make a choice (1-5): "); br = new BufferedReader(new InputStreamReader(System.in)); try { choice = Integer.parseInt(br.readLine()); } catch (IOException e) { System.out.println("Error!"); System.exit(1); } switch (choice) { case (1): //GET all the e-radio location links (in order to get all the links) doc = parseUrl(URL, 0); if (doc == null) { print("No connection to the server! Exiting..."); System.exit(1); } Elements links = doc.select("div[id=paneContainer]").select("a[href*=/locations/]"); for (Element link : links) theUrls.add(link.attr("abs:href")); System.out.println("...Processing <All e-radio> station links"); break; case (2): //Get CATEGORIES doc = parseUrl(URL, 0); if (doc == null) { print("No connection to the server! Exiting..."); System.exit(1); } Elements categoryLinks = doc.select("div[id=paneContainer]").select("a[href*=/categories/]"); System.out.println("E-radio stations available categories: " + "\n"); for (int i = 0; i < categoryLinks.size(); i++) { System.out.println(i + 1 + ". 
" + StringEscapeUtils.unescapeHtml4(categoryLinks.get(i).html())); } System.out.print("\n" + "Please make a choise (1-" + categoryLinks.size() + "): "); br = new BufferedReader(new InputStreamReader(System.in)); try { choice = Integer.parseInt(br.readLine()); } catch (IOException e) { System.out.println("Error!"); System.exit(1); } if (choice <= categoryLinks.size() && choice >= 1) { theUrls.add(categoryLinks.get(choice - 1).attr("abs:href")); System.out.println("...Processing the <" + StringEscapeUtils.unescapeHtml4(categoryLinks.get(choice - 1).html()) + "> category"); } else { System.out.println("Wrong selection..."); System.out.println("Exiting program..."); System.exit(1); } break; case (3)://Get LOCATIONS doc = parseUrl(URL, 0); if (doc == null) { print("No connection to the server! Exiting..."); System.exit(1); } Elements locationLinks = doc.select("div[id=paneContainer]").select("a[href*=/locations/]"); System.out.println("E-radio stations available locations: " + "\n"); for (int i = 0; i < locationLinks.size(); i++) { System.out.println(i + 1 + ". " + StringEscapeUtils.unescapeHtml4(locationLinks.get(i).html())); } System.out.print("\n" + "Please make a choise (1-" + locationLinks.size() + "): "); br = new BufferedReader(new InputStreamReader(System.in)); try { choice = Integer.parseInt(br.readLine()); } catch (IOException e) { System.out.println("Error!"); System.exit(1); } if (choice <= locationLinks.size() && choice >= 1) { theUrls.add(locationLinks.get(choice - 1).attr("abs:href")); System.out.println("...Processing <" + StringEscapeUtils.unescapeHtml4(locationLinks.get(choice - 1).html()) + "> locatino"); } else { System.out.println("Wrong selection!"); System.out.println("Exiting program..."); System.exit(1); } break; case (4): final int YEARLY_RATING = 10; doc = parseUrl(URL, 0); if (doc == null) { print("No connection to the server! 
Exiting..."); System.exit(1); } Elements ratingsMenu = doc.select("div[class=menuFly]").select("li").select("a[class=hide]"); print("\nStations ratings: \n"); for (int i = 0; i < ratingsMenu.size(); i++) { System.out.println(i + 1 + ". " + StringEscapeUtils.unescapeHtml4(ratingsMenu.get(i).html())); } System.out.print("\n" + "Please make a choise (1-" + ratingsMenu.size() + "): "); br = new BufferedReader(new InputStreamReader(System.in)); try { choice = Integer.parseInt(br.readLine()); } catch (IOException e) { System.out.println("Error!"); System.exit(1); } /* * The html of the Ratings menu processed * has this structure: * <div> * <ul> * <li> * <ul> * ... * </ul> * </li> * ... * </ul> * </div> */ if (choice <= ratingsMenu.size() && choice >= 1) { //Get the DIV element with class "menuFly" Elements div = doc.select("div[class=menuFly]"); //div Elements list has only one element. So get the children of div Elements ul = div.get(0).children(); //ul Elements list has only one element. So get the children of ul Elements li = ul.get(0).children(); //remove blank elements for (int j = 0; j < li.size(); j++) { if (li.get(j).hasText() == false) li.remove(li.get(j)); } //get the title of user choice and print it out print("\n%s", StringEscapeUtils.unescapeHtml4(ratingsMenu.get(choice - 1).html()) + "\n"); //check if there is a sub-menu Elements ulTag = li.get(choice - 1).select("ul"); if (ulTag.hasText() == true) { Elements subMenu = ulTag.select("li").select("a[href]"); //print the sub-menu for (int j = 0; j < subMenu.size(); j++) print("%s. 
%s ", j + 1, StringEscapeUtils.unescapeHtml4(subMenu.get(j).html())); System.out.print("\n" + "Please make a choise (1-" + subMenu.size() + "): "); //read user input br = new BufferedReader(new InputStreamReader(System.in)); try { choice = Integer.parseInt(br.readLine()); } catch (IOException e) { System.out.println("Error!"); System.exit(1); } if (choice <= subMenu.size() && choice >= 1) { theUrls.add(subMenu.get(choice - 1).attr("abs:href")); System.out.println("...Processing the <" + StringEscapeUtils.unescapeHtml4(subMenu.get(choice - 1).html()) + "> category"); } else { System.out.println("Wrong selection!"); System.out.println("Exiting program..."); System.exit(1); } } else { if (choice == YEARLY_RATING) { String url = li.get(choice - 1).select("a[href").attr("abs:href"); doc = parseUrl(url, 0); if (doc != null) { Elements yearTopSubMenu = doc.select("div[id=maintabsid]").select("a[href]"); //print the sub-menu for (int i = 0; i < yearTopSubMenu.size(); i++) print("%s. %s", i + 1, StringEscapeUtils.unescapeHtml4(yearTopSubMenu.get(i).html())); System.out.print("\n" + "Please make a choise (1-" + yearTopSubMenu.size() + "): "); //read user input br = new BufferedReader(new InputStreamReader(System.in)); try { choice = Integer.parseInt(br.readLine()); } catch (IOException e) { System.out.println("Error!"); System.exit(1); } if (choice <= yearTopSubMenu.size() && choice >= 1) { if (choice == 1) { theUrls.add(yearTopSubMenu.get(choice - 1).attr("abs:href")); print("...Processing the <" + StringEscapeUtils.unescapeHtml4(yearTopSubMenu.get(choice - 1).html()) + "> category"); } else if (choice == 2) { String link = yearTopSubMenu.get(choice - 1).attr("abs:href"); doc = parseUrl(link, 0); //print menu title print("\n%s", StringEscapeUtils.unescapeHtml4(yearTopSubMenu.get(choice - 1).html()) + "\n"); if (doc != null) { Elements elem = doc.select("select[id=selectoption]") .select("option[value]"); ArrayList<Integer> nums = new ArrayList<Integer>(); for (int i = 0; i 
< elem.size(); i++) { //get the select category values and print the sub-menu int num = Integer.parseInt(elem.get(i).attr("value")); //add them to list nums.add(num); print("%s. %s", i + 1, StringEscapeUtils.unescapeHtml4( elem.get(i).html().replace("Select category: ", ""))); } System.out.print("\n" + "Please make a choise (1-" + elem.size() + "): "); //read user input br = new BufferedReader(new InputStreamReader(System.in)); try { choice = Integer.parseInt(br.readLine()); } catch (IOException e) { System.out.println("Error!"); System.exit(1); } if (choice <= elem.size() && choice >= 1) { int num = nums.get(choice - 1); String added = "max=100&id=" + num + "&"; String newlink = link.replace("max=100&", added); //print("\nlink: %s", newlink); DEBUG print theUrls.add(newlink); System.out .println("...Processing the <" + StringEscapeUtils.unescapeHtml4(elem.get(choice - 1) .html().replace("Select category: ", "")) + "> category"); print(elem.get(choice - 1).select("a[href]").attr("abs:href")); } else { System.out.println("Wrong selection!"); System.out.println("Exiting program..."); System.exit(1); } } else { System.out.println("ERROR: Cannot get links from server!"); System.out.println("Exiting program..."); System.exit(1); } } else { String link = yearTopSubMenu.get(choice - 1).attr("abs:href"); doc = parseUrl(link, 0); //print menu title print("\n%s", StringEscapeUtils.unescapeHtml4(yearTopSubMenu.get(choice - 1).html()) + "\n"); if (doc != null) { Elements elem = doc.select("select[id=selectoption]") .select("option[value]"); ArrayList<Integer> nums = new ArrayList<Integer>(); for (int i = 0; i < elem.size(); i++) { //get the select category values and print the sub-menu int num = Integer.parseInt(elem.get(i).attr("value")); //add them to list nums.add(num); print("%s. 
%s", i + 1, StringEscapeUtils.unescapeHtml4( elem.get(i).html().replace("Select location: ", ""))); } System.out.print("\n" + "Please make a choise (1-" + elem.size() + "): "); //read user input br = new BufferedReader(new InputStreamReader(System.in)); try { choice = Integer.parseInt(br.readLine()); } catch (IOException e) { System.out.println("Error!"); System.exit(1); } if (choice <= elem.size() && choice >= 1) { int num = nums.get(choice - 1); String[] linkParts = link.split("&", 4); String finalLink = linkParts[0] + "&" + linkParts[1] + "&" + "id=" + num + "&" + linkParts[3]; //print("\nlink: %s \n link2: %s \n link3: %s \n link: %s \nsize: %s", linkParts[0], linkParts[1], linkParts[2], linkParts[3], linkParts.length); // DEBUG print //print(finalLink); theUrls.add(finalLink); System.out .println("...Processing the <" + StringEscapeUtils.unescapeHtml4(elem.get(choice - 1) .html().replace("Select category: ", "")) + "> category"); print(elem.get(choice - 1).select("a[href]").attr("abs:href")); } else { System.out.println("Wrong selection!"); System.out.println("Exiting program..."); System.exit(1); } } else { System.out.println("ERROR: Cannot get links from server!"); System.out.println("Exiting program..."); System.exit(1); } } } else { System.out.println("Wrong selection!"); System.out.println("Exiting program..."); System.exit(1); } } else { System.out.println("ERROR: Cannot get links from server!"); System.out.println("Exiting program..."); System.exit(1); } } else { theUrls.add(li.get(choice - 1).select("a[href").attr("abs:href")); System.out.println("...Processing the <" + StringEscapeUtils.unescapeHtml4(ratingsMenu.get(choice - 1).html()) + "> category"); print(li.get(choice - 1).select("a[href]").attr("abs:href")); } } } else { System.out.println("Wrong selection!"); System.out.println("Exiting program..."); System.exit(1); } break; case (5): System.out.println("Exiting program..."); System.exit(0); break; default: System.out.println("Invalid choice! 
Exiting..."); System.exit(1); break; } }
From source file:noThreads.ParseLevel2.java
/** * * @param theLinks/* ww w . ja v a2 s. co m*/ * @throws IOException */ public void getSecondLinks(ArrayList<String> theLinks) throws IOException { float num = 0; String temp, attrOfScr, subString; Document doc; boolean flag; for (String sLink : theLinks) { if ((sLink.endsWith(".asx") == true) || (sLink.endsWith(".swf") == true)) { stationLinks2.add(sLink); print("Written to file: %s", sLink); } else { //iframeCase(sLink); doc = parseUrl(sLink, 0); if (doc != null) { Elements media = doc.select("[src]"); print("Fetching %s --> ", sLink); flag = false; for (Element src : media) { if (src.tagName().equals("embed") == true) { flag = true; temp = src.attr("abs:src"); if (temp.endsWith(".swf") == true) { attrOfScr = src.attr("abs:flashvars"); // System.out.println("\nThis is src of embed tag: " // +temp // +"\nThis is attribute flashvars of embed tag: " // +attrOfScr); int start = attrOfScr.indexOf("http://", attrOfScr.indexOf("http://") + 1); int end = attrOfScr.indexOf("&"); char a_char = attrOfScr.charAt(end - 1); if (start != -1 && end != -1) { if (a_char == ';') { subString = attrOfScr.substring(start, end - 1); } else { subString = attrOfScr.substring(start, end); } //System.out.println("\nthis is the result subString: "+subString); stationLinks2.add(subString); } else { //something's wrong, do not process the link flag = false; } break;//link found } stationLinks2.add(temp); break;//link found, load next url } } //end nested for if (flag == false) {//the code has no embed tag stationLinks2.add(sLink); } } } num = (float) (theLinks.indexOf(sLink)) / (float) (theLinks.size()) * WEIGHT_IN_COMPUTATION + curProgress.getCurProgressPart1(); curProgress.setCurProgress((int) num); } //end outer for writeLinksToFile(links2FileName, stationLinks2); print("Written %s to file, second links.", stationLinks2.size()); }
From source file:org.aliuge.crawler.extractor.selector.AbstractElementCssSelector.java
/** * ??????/*from w w w. j a v a2 s. c o m*/ * @param elements * @param attr * @return */ protected String getExtractAttr(Elements elements, String attr) { String temp = ""; if (attr.equalsIgnoreCase("tostring")) { return temp = elements.attr(attr).toString(); } else { if (index == -1 && StringUtils.isNotBlank(this.regex)) { for (Element e : elements) { Element element = e; if (element.select(this.regex).size() > 0) { return temp = e.attr(attr); } } return temp; } else { if (index > -1 && index < elements.size()) { return elements.get(index).attr(attr); } } return elements.first().attr(attr); } /*if(null!=pattern){ Matcher m = pattern.matcher(temp); if(m.find()){ temp = m.group(1); } }*/ //return temp; }
From source file:org.aliuge.crawler.extractor.selector.action.ActionFactory.java
/**
 * Builds the {@link SelectorAction} described by an {@code <action>} configuration element.
 *
 * <p>The action family is chosen by {@code c} ({@code string}, {@code integer}/{@code int},
 * {@code date}, {@code numerica}, {@code file}; anything else is treated as a string action)
 * and the concrete action by the element's {@code operation} attribute.
 *
 * @param element the {@code <action>} element carrying {@code operation} and its arguments
 * @param c       the value type of the enclosing selector
 * @return the configured action, or {@code null} when the operation is missing, unknown,
 *         unsupported for the type, or its arguments are invalid
 */
@SuppressWarnings("unchecked")
public static SelectorAction create(Element element, String c) {
    if ("string".equals(c)) {
        StringActionType $type = EnumUtils.getEnum(StringActionType.class, element.attr("operation"));
        if (null == $type) {
            // switch on a null enum throws NullPointerException; report and give up instead.
            reportUnknownOperation(element);
            return null;
        }
        switch ($type) {
        case after:
            return new StringAfterAction(element.attr("split"));
        case afterLast:
            return new StringAfterLastAction(element.attr("split"));
        case before:
            return new StringBeforeAction(element.attr("split"));
        case beforeLast:
            return new StringBeforeLastAction(element.attr("split"));
        case between:
            return new StringBetweenAction(element.attr("exp"));
        case filter:
            return new StringFilterAction(element.attr("filter"), element.attr("charType"));
        case replace: {
            // "exp" holds "search,replacement"
            String exp = element.attr("exp");
            String[] kv = exp.split(",");
            if (kv.length == 2) {
                return new StringReplaceAction(kv[0], kv[1]);
            }
            // Previously fell through into the split case and built the wrong action.
            System.err.println("Invalid replace expression '" + exp + "' on <" + element.tagName()
                    + ">: expected 'search,replacement'");
            break;
        }
        case split:
            return new StringSplitAction(element.attr("split"), element.attr("index"));
        case sub:
            return new StringSubAction(element.attr("exp"));
        case suffix:
            return new StringSuffixAction(element.attr("suffix"));
        case perfix:
            return new StringPerfixAction(element.attr("perfix"));
        case regex:
            return new StringRegexAction(element.attr("exp"));
        default:
            break;
        }
    } else if ("integer".equals(c) || "int".equals(c)) {
        IntegerActionType $type = EnumUtils.getEnum(IntegerActionType.class, element.attr("operation"));
        if (null == $type) {
            reportUnknownOperation(element);
            return null;
        }
        switch ($type) {
        case abs:
            return new IntegerAbsAction();
        case between:
            try {
                return new IntegerBetweenAction(element.attr("exp"), element.attr("default"));
            } catch (IntegerBetweenExpressionException e) {
                e.printStackTrace();
            }
            break;
        default:
            break;
        }
    } else if ("date".equals(c)) {
        // no date actions are implemented
    } else if ("numerica".equals(c)) {
        IntegerActionType $type = EnumUtils.getEnum(IntegerActionType.class, element.attr("operation"));
        if (null == $type) {
            reportUnknownOperation(element);
            return null;
        }
        switch ($type) {
        case abs:
            return new IntegerAbsAction();
        case between:
            try {
                return new IntegerBetweenAction(element.attr("exp"), element.attr("default"));
            } catch (Exception e) {
                e.printStackTrace();
            }
            break;
        default:
            break;
        }
    } else if ("file".equals(c)) {
        FileActionType $type = EnumUtils.getEnum(FileActionType.class, element.attr("operation"));
        if (null == $type) {
            reportUnknownOperation(element);
            return null;
        }
        switch ($type) {
        case download: {
            String dir = element.attr("dir");
            String fileName = element.attr("fileName");
            // a blank file name or the "{md5}" placeholder both select md5-based naming
            boolean md5File = !StringUtils.isNotBlank(fileName) || "{md5}".equals(fileName);
            String asynText = element.attr("asyn");
            // asynchronous unless explicitly configured otherwise
            boolean asyn = !StringUtils.isNotBlank(asynText) || Boolean.parseBoolean(asynText);
            return new DownLoadFileAction(dir, md5File, asyn);
        }
        case download_resize: {
            String dir2 = element.attr("dir");
            String fileName2 = element.attr("fileName");
            boolean md5File2 = !StringUtils.isNotBlank(fileName2) || "{md5}".equals(fileName2);
            String temp2 = element.attr("asyn");
            boolean asyn2 = !StringUtils.isNotBlank(temp2) || Boolean.parseBoolean(temp2);
            DownLoadImageResizeAction resizeAction = new DownLoadImageResizeAction(dir2, md5File2, asyn2);
            // optional tuning attributes; each is applied only when present
            temp2 = element.attr("width");
            if (StringUtils.isNotBlank(temp2)) {
                resizeAction.setW(Integer.parseInt(temp2));
            }
            temp2 = element.attr("height");
            if (StringUtils.isNotBlank(temp2)) {
                resizeAction.setH(Integer.parseInt(temp2));
            }
            temp2 = element.attr("quality");
            if (StringUtils.isNotBlank(temp2)) {
                resizeAction.setQuality(Float.parseFloat(temp2));
            }
            temp2 = element.attr("del");
            if (StringUtils.isNotBlank(temp2)) {
                resizeAction.setDeleteOldFile(Boolean.parseBoolean(temp2));
            }
            return resizeAction;
        }
        default:
            break;
        }
    } else {
        // any other type is handled like a string, with separate search/replacement attributes
        StringActionType $type = EnumUtils.getEnum(StringActionType.class, element.attr("operation"));
        if (null == $type) {
            reportUnknownOperation(element);
            return null;
        }
        switch ($type) {
        case after:
            return new StringAfterAction(element.attr("split"));
        case afterLast:
            return new StringAfterLastAction(element.attr("split"));
        case before:
            return new StringBeforeAction(element.attr("split"));
        case beforeLast:
            return new StringBeforeLastAction(element.attr("split"));
        case between:
            return new StringBetweenAction(element.attr("exp"));
        case filter:
            return new StringFilterAction(element.attr("filter"), element.attr("charType"));
        case replace:
            return new StringReplaceAction(element.attr("search"), element.attr("replacement"));
        case split:
            return new StringSplitAction(element.attr("split"), element.attr("index"));
        case sub:
            return new StringSubAction(element.attr("exp"));
        case suffix:
            return new StringSuffixAction(element.attr("suffix"));
        case perfix:
            return new StringPerfixAction(element.attr("perfix"));
        default:
            break;
        }
    }
    return null;
}

/** Reports an action element whose {@code operation} attribute is missing or not a known constant. */
private static void reportUnknownOperation(Element element) {
    System.err.println("Unknown or missing operation '" + element.attr("operation") + "' on <"
            + element.tagName() + ">");
}
From source file:org.aliuge.crawler.extractor.selector.factory.ElementCssSelectorFactory.java
/** * <b>Element??Element??select/*from www. j a v a 2 s . c om*/ * @param element * @return */ @SuppressWarnings("unchecked") public static AbstractElementCssSelector create(Element element) { String name = element.attr("name"); String value = element.attr("value"); String type = element.attr("type"); String attr = element.attr("attr"); String pattern = element.attr("pattern"); String regex = element.attr("regex"); String required = element.attr("required"); String sIndex = element.attr("index"); boolean isRequired = false; if (StringUtils.isNotBlank(required)) { isRequired = Boolean.parseBoolean(required); } int index = -1; if (StringUtils.isNotBlank(sIndex)) { index = Integer.parseInt(sIndex) - 1; } AbstractElementCssSelector selector = ElementCssSelectorFactory.create(name, type, value, attr, isRequired, index, regex, pattern); // ? Elements children = element.children(); for (Element e : children) { if ("action".equals(e.tagName())) { SelectorAction action = ActionFactory.create(e, element.attr("type")); if (action != null) selector.addAction(action); } // ?Url else if ("element".equals(e.tagName())) { ((PageElementSelector) selector).addSelector(create(e)); } } return selector; }
From source file:org.aliuge.crawler.extractor.selector.IFConditions.java
/**
 * Builds an {@code IFConditions} from an {@code <if>} configuration node.
 *
 * <p>The node's {@code test} attribute is the condition expression; each direct child
 * whose tag is {@code element} is turned into a selector and attached to the condition.
 *
 * @param element the {@code <if>} node, may be {@code null}
 * @return the configured condition, or {@code null} when {@code element} is {@code null}
 */
public static IFConditions create(Element element) {
    if (element == null) {
        return null;
    }

    IFConditions conditions = new IFConditions(element.attr("test"));
    for (Element child : element.children()) {
        if (!child.tagName().equals("element")) {
            continue;
        }
        conditions.addSelector(ElementCssSelectorFactory.create(child));
    }
    return conditions;
}
From source file:org.aliuge.crawler.jobconf.ExtractConfig.java
/** * ????//from w ww . j a v a 2 s . com * @param doc * @return * @throws ConfigurationException */ public ExtractConfig loadConfig(Document doc) { Elements extractElement = doc.select("extract"); super.setJobName(doc.select("job").attr("name")); super.setIndexName(doc.select("job").attr("indexName")); String temp = extractElement.select("threadNum").text(); if (StringUtils.isNotBlank(temp)) { this.threadNum = Integer.parseInt(temp); } Elements templateElement = extractElement.select("extract").select("template"); Iterator<Element> it = templateElement.iterator(); while (it.hasNext()) { Element template = it.next(); ExtractTemplate extractTemplate = new ExtractTemplate(); // ?Url???? Elements urlPatternElement = template.select("url"); List<Pattern> patterns = Lists.newArrayList(); for (Element urlElement : urlPatternElement) { patterns.add(Pattern.compile(urlElement.text())); } extractTemplate.setUrlPattern(patterns); extractTemplate.setName(template.attr("name")); // ??? Elements selectElement = template.select("elements").first().children(); for (Element element : selectElement) { if ("element".equals(element.tagName())) { AbstractElementCssSelector<?> selector = ElementCssSelectorFactory.create(element); extractTemplate.addCssSelector(selector); } else if ("if".equals(element.tagName())) { IFConditions ifConditions = IFConditions.create(element); extractTemplate.addConditions(ifConditions); } } super.setExtractConfig(this); this.templates.add(extractTemplate); } //super.setExtractConfig(this); return this; }