// Java tutorial
/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.aliuge.crawler.extractor.selector; import java.util.List; import java.util.Map; import org.aliuge.crawler.exception.ExtractException; import org.aliuge.crawler.extractor.selector.action.SelectorAction; import org.apache.commons.lang3.StringUtils; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; public abstract class AbstractElementCssSelector<T> { /** * ?? */ protected String name; /** * css selector */ protected String value; /** * img?src?text */ protected String attr; /** * attrSelectorAttr */ protected SelectorAttr $Attr; /** * ?required */ protected boolean isRequired = false; protected String regex; //protected Pattern pattern = null; /** * ?? */ protected Document document; /** * true??document<br> * falsedocument?? */ protected boolean newDoc = true; /** * ???. 
*/ protected int index = -1; /** * elementaction */ protected List<SelectorAction<T>> actions; /** * */ public AbstractElementCssSelector() { }; /** * * @param name * @param value * @param atrr * @param isRequired * @param document */ public AbstractElementCssSelector(String name, String value, String attr, boolean isRequired, int index, String regex) { super(); this.name = name; this.value = value; this.attr = attr; this.$Attr = org.apache.commons.lang3.EnumUtils.getEnum(SelectorAttr.class, this.attr); if (this.$Attr == null) { this.$Attr = SelectorAttr.other; } this.isRequired = isRequired; this.index = index; this.regex = regex; /*if(StringUtils.isNotBlank(regex)) this.pattern = Pattern.compile(regex);*/ } /** * @param name * @param value * @param attr * @param isRequired * @param index */ public AbstractElementCssSelector(String name, String value, String attr, boolean isRequired, int index) { super(); this.name = name; this.value = value; this.attr = attr; this.$Attr = org.apache.commons.lang3.EnumUtils.getEnum(SelectorAttr.class, this.attr); this.isRequired = isRequired; this.index = index; } /** * * @param name * @param value * @param attr * @param isRequired */ public AbstractElementCssSelector(String name, String value, String attr, boolean isRequired) { super(); this.name = name; this.value = value; this.attr = attr; this.$Attr = org.apache.commons.lang3.EnumUtils.getEnum(SelectorAttr.class, this.attr); this.isRequired = isRequired; } /** * ?? * @return */ public abstract T getContent() throws ExtractException; /** * ??k???namevalue? 
* @return */ public abstract Map<String, T> getContentMap() throws ExtractException; public abstract void addAction(SelectorAction<T> action); public String getName() { return name; } @SuppressWarnings("unchecked") public AbstractElementCssSelector setName(String name) { this.name = name; return this; } public int getIndex() { return index; } public void setIndex(int index) { this.index = index; } public String getValue() { return value; } @SuppressWarnings("unchecked") public AbstractElementCssSelector setValue(String value) { this.value = value; return this; } public String getAttr() { return attr; } @SuppressWarnings("unchecked") public AbstractElementCssSelector setAttr(String attr) { this.attr = attr; return this; } public boolean isRequired() { return isRequired; } @SuppressWarnings("unchecked") public AbstractElementCssSelector setRequired(boolean isRequired) { this.isRequired = isRequired; return this; } public Document getDocument() { return document; } @SuppressWarnings("unchecked") public AbstractElementCssSelector setDocument(Document document) { this.document = document; this.newDoc = true; return this; } public SelectorAttr get$Attr() { return $Attr; } @SuppressWarnings("unchecked") public AbstractElementCssSelector set$Attr(SelectorAttr $Attr) { this.$Attr = $Attr; return this; } @SuppressWarnings("unchecked") public AbstractElementCssSelector setNewDoc(boolean newDoc) { this.newDoc = newDoc; return this; } /** * ????document? */ protected void isNewDoc() { this.newDoc = true; } /** * ???? 
* @param elements * @return */ protected String getExtractText(Elements elements) { if (elements.size() == 0) return null; String temp = ""; if (attr.equalsIgnoreCase("tostring")) { return temp = elements.toString(); } else { if (index == -1 && StringUtils.isNotBlank(this.regex)) { for (Element e : elements) { Element element = e; if (element.select(this.regex).size() > 0) { return temp = e.text(); } } return temp; } else { if (index > -1 && index < elements.size()) { return elements.get(index).text(); } } return elements.first().text(); } /*if(attr.equals("tostring")){ if(index==0 || index>elements.size()) temp = elements.first().toString(); else temp = elements.get(index).toString(); }else{ if(index==0 || index>elements.size()) temp = elements.first().text(); else temp = elements.get(index).text(); } if(null!=pattern){ Matcher m = pattern.matcher(temp); if(m.find()){ temp = m.group(1); } }*/ //return temp; } /** * ?????? * @param elements * @param attr * @return */ protected String getExtractAttr(Elements elements, String attr) { String temp = ""; if (attr.equalsIgnoreCase("tostring")) { return temp = elements.attr(attr).toString(); } else { if (index == -1 && StringUtils.isNotBlank(this.regex)) { for (Element e : elements) { Element element = e; if (element.select(this.regex).size() > 0) { return temp = e.attr(attr); } } return temp; } else { if (index > -1 && index < elements.size()) { return elements.get(index).attr(attr); } } return elements.first().attr(attr); } /*if(null!=pattern){ Matcher m = pattern.matcher(temp); if(m.find()){ temp = m.group(1); } }*/ //return temp; } public String getRegex() { return regex; } public void setRegex(String regex) { this.regex = regex; } @Override public String toString() { return "AbstractElementCssSelector [name=" + name + ", value=" + value + ", attr=" + attr + ", $Attr=" + $Attr + ", isRequired=" + isRequired + ", regex=" + regex + ", newDoc=" + newDoc + ", index=" + index + "]"; } }