Java tutorial
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.aliuge.crawler.jobconf;

import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

import org.aliuge.crawler.exception.ConfigurationException;
import org.aliuge.crawler.exception.ExtractException;
import org.aliuge.crawler.extractor.selector.AbstractElementCssSelector;
import org.aliuge.crawler.extractor.selector.IFConditions;
import org.aliuge.crawler.extractor.selector.factory.ElementCssSelectorFactory;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

/**
 * Configuration of the "extract" section of a crawler job description.
 * <p>
 * Holds a thread count and the list of {@link ExtractTemplate}s parsed from
 * the job document, and offers helpers that run those templates against a
 * page.
 *
 * @author
 * @date 201481 (presumably 2014-08-01; the separators were lost in the original)
 */
public class ExtractConfig extends Configuration {

    /**
     * Thread count; defaults to 10 and is overridden by the text of the
     * {@code threadNum} element when present. Presumably the number of
     * extraction worker threads - confirm against the code that reads it.
     */
    private int threadNum = 10;

    /**
     * Templates in the order they were read by {@link #loadConfig(Document)}.
     */
    private final List<ExtractTemplate> templates = Lists.newArrayList();

    /**
     * Runs every template against the document and merges all results into
     * one map. Templates are applied in list order, so on duplicate keys the
     * value from the later template replaces the earlier one.
     *
     * @param document the page to run the templates against
     * @return the merged results; empty when no template produced anything
     * @throws ExtractException if a template fails
     */
    public Map<String, Object> getContentAll(Document document) throws ExtractException {
        Map<String, Object> content = Maps.newHashMap();
        for (ExtractTemplate template : templates) {
            Map<String, Object> result = template.getContentResultMap(document);
            if (result != null && !result.isEmpty()) {
                content.putAll(result);
            }
        }
        return content;
    }

    /**
     * Runs only the templates whose {@code urlFilter(url)} accepts the given
     * URL and returns the first non-empty result, stored under this
     * configuration's index name.
     *
     * @param document the page to run the templates against
     * @param url      the URL used to choose the applicable templates
     * @return a map with a single entry (index name to template result), or
     *         an empty map when no accepted template produced a result
     * @throws ExtractException if a template fails
     */
    public Map<String, Object> getContentSeprator(Document document, String url) throws ExtractException {
        Map<String, Object> content = Maps.newHashMap();
        for (ExtractTemplate template : templates) {
            if (!template.urlFilter(url)) {
                continue;
            }
            Map<String, Object> result = template.getContentResultMap(document);
            if (result != null && !result.isEmpty()) {
                // Only the first accepted template with a non-empty result is used.
                content.put(super.getIndexName(), result);
                return content;
            }
        }
        return content;
    }

    /**
     * Populates this configuration from a job description document.
     * <p>
     * Reads the {@code name} and {@code indexName} attributes of the
     * {@code job} element, the optional {@code threadNum} text inside
     * {@code extract}, and one {@link ExtractTemplate} per {@code template}
     * element inside {@code extract}.
     *
     * @param doc the parsed job description
     * @return this instance
     * @throws NumberFormatException    if the {@code threadNum} text is not blank
     *                                  and not an integer
     * @throws IllegalArgumentException if a URL pattern is not a valid regular
     *                                  expression, or a template has no
     *                                  {@code elements} node
     */
    public ExtractConfig loadConfig(Document doc) {
        Elements extractElement = doc.select("extract");
        Elements jobElement = doc.select("job");
        super.setJobName(jobElement.attr("name"));
        super.setIndexName(jobElement.attr("indexName"));

        String threadNumText = extractElement.select("threadNum").text();
        if (StringUtils.isNotBlank(threadNumText)) {
            this.threadNum = Integer.parseInt(threadNumText);
        }

        Elements templateElements = extractElement.select("extract").select("template");
        for (Element template : templateElements) {
            ExtractTemplate extractTemplate = parseTemplate(template);
            // NOTE(review): this runs once per template, so a document without
            // any template never registers this config with the superclass.
            // Kept as in the original; confirm whether that is intended.
            super.setExtractConfig(this);
            this.templates.add(extractTemplate);
        }
        return this;
    }

    /**
     * Builds one template from a {@code template} element: its URL patterns,
     * its name, and the selectors and conditions listed under {@code elements}.
     */
    private ExtractTemplate parseTemplate(Element template) {
        ExtractTemplate extractTemplate = new ExtractTemplate();

        // URL patterns: one compiled regex per <url> element.
        List<Pattern> patterns = Lists.newArrayList();
        for (Element urlElement : template.select("url")) {
            patterns.add(Pattern.compile(urlElement.text()));
        }
        extractTemplate.setUrlPattern(patterns);
        extractTemplate.setName(template.attr("name"));

        // first() returns null when nothing matches; report the broken template
        // by name instead of failing with a bare NullPointerException.
        Element elementsNode = template.select("elements").first();
        if (elementsNode == null) {
            throw new IllegalArgumentException(
                    "Extract template '" + template.attr("name") + "' has no <elements> element");
        }

        // Only <element> and <if> children are recognised; other tags are ignored.
        for (Element element : elementsNode.children()) {
            String tagName = element.tagName();
            if ("element".equals(tagName)) {
                AbstractElementCssSelector<?> selector = ElementCssSelectorFactory.create(element);
                extractTemplate.addCssSelector(selector);
            } else if ("if".equals(tagName)) {
                IFConditions ifConditions = IFConditions.create(element);
                extractTemplate.addConditions(ifConditions);
            }
        }
        return extractTemplate;
    }

    /**
     * @return the configured thread count
     */
    public int getThreadNum() {
        return threadNum;
    }

    /**
     * @param threadNum the thread count to use
     */
    public void setThreadNum(int threadNum) {
        this.threadNum = threadNum;
    }

    /**
     * @return the internal template list itself (not a copy), so changes made
     *         by the caller affect this configuration
     */
    public List<ExtractTemplate> getTemplates() {
        return templates;
    }

    /**
     * Returns a short description containing the thread count, at most the
     * first ten templates, and the job name.
     */
    @Override
    public String toString() {
        final int maxLen = 10;
        StringBuilder builder = new StringBuilder();
        builder.append("ExtractConfig [threadNum=").append(threadNum)
                .append(", templates=")
                .append(templates.subList(0, Math.min(templates.size(), maxLen)))
                .append(", jobName=").append(getJobName())
                .append("]");
        return builder.toString();
    }
}