Java tutorial: loading crawler configuration from crawler4j.xml with dom4j
/*
 * Copyright (c) 2012 Zhuoran Wang <zoran.wang@gmail.com>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package me.zhuoran.crawler4j.crawler.config;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import me.zhuoran.crawler4j.crawler.DefaultWebCrawler;
import me.zhuoran.crawler4j.crawler.WebCrawler;
import me.zhuoran.crawler4j.crawler.parser.Parser;
import me.zhuoran.crawler4j.crawler.util.Reflections;
import me.zhuoran.crawler4j.crawler.util.StringUtils;

import org.dom4j.Document;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Parses the crawler4j.xml configuration file and builds the configured crawlers.
 *
 * @author Zoran
 */
public class XmlLoader {

    private static Logger logger = LoggerFactory.getLogger(XmlLoader.class);

    private static final String DEFAULT_CONFIG_FILE = "crawler4j.xml";

    private Document document = null;

    /**
     * Load and parse crawler4j.xml to get all crawlers.
     *
     * @return list of WebCrawler instances
     */
    @SuppressWarnings("unchecked")
    public List<WebCrawler> load() {
        List<WebCrawler> crawlers = new ArrayList<WebCrawler>();
        SAXReader reader = new SAXReader();
        try {
            // Read the configuration file from the classpath.
            InputStream is = this.getClass().getClassLoader().getResourceAsStream(DEFAULT_CONFIG_FILE);
            document = reader.read(is);
            Element root = document.getRootElement();
            // Each child element of the root describes one crawler task.
            for (Iterator<Element> i = root.elementIterator(); i.hasNext();) {
                Element taskElement = i.next();
                String name = taskElement.elementText("name");
                long delay = Long.parseLong(taskElement.elementText("delay"));
                String url = taskElement.elementText("url");
                String parserName = taskElement.elementText("parser").trim();
                String defaultCharset = taskElement.elementText("charset");
                String pageNoStr = taskElement.elementText("max_page");
                String crawlerName = taskElement.elementText("crawler");
                String nextPageRegex = taskElement.elementText("next_page_key");
                String extractLinksElementId = taskElement.elementText("extract_links_elementId");

                // Instantiate the page parser class named in the configuration.
                Class<Parser> c = (Class<Parser>) Class.forName(parserName);
                Parser parser = c.newInstance();
                if (parser == null) {
                    throw new IllegalArgumentException("parser must not be null!");
                }

                int maxPageNo = 1;
                if (StringUtils.isNotEmpty(pageNoStr)) {
                    maxPageNo = Integer.parseInt(pageNoStr);
                }

                CrawlerConfig config = new CrawlerConfig(name, defaultCharset, url, delay, maxPageNo,
                        nextPageRegex, extractLinksElementId, parserName, crawlerName, parser);

                // Get the WebCrawler instance through reflection, or fall back to the default implementation.
                WebCrawler crawler = null;
                if (StringUtils.isBlank(crawlerName)) {
                    crawler = new DefaultWebCrawler(config);
                } else {
                    crawler = (WebCrawler) Reflections.constructorNewInstance(crawlerName,
                            new Class[] { CrawlerConfig.class }, new CrawlerConfig[] { config });
                }
                crawlers.add(crawler);
            }
        } catch (Throwable e) {
            logger.error(e.getMessage(), e);
        }
        return crawlers;
    }
}
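
Note that the loader does not fix the names of the root element or of the per-task child elements: it simply iterates over every child of the root and reads the fields shown above (name, delay, url, parser, charset, max_page, crawler, next_page_key, extract_links_elementId). A crawler4j.xml on the classpath could therefore look like the sketch below; the <crawler4j> root tag, the <task> element name, the com.example.ArticleParser class, and all of the values are illustrative assumptions, only the field names come from the loader itself.

<?xml version="1.0" encoding="UTF-8"?>
<!-- Hypothetical sample: root/task element names and all values are assumptions;
     only the field names are taken from XmlLoader. -->
<crawler4j>
    <task>
        <name>example-news</name>
        <delay>1000</delay>
        <url>http://example.com/news</url>
        <!-- fully qualified class implementing Parser; ArticleParser is hypothetical -->
        <parser>com.example.ArticleParser</parser>
        <charset>UTF-8</charset>
        <max_page>5</max_page>
        <!-- <crawler> omitted, so the loader falls back to DefaultWebCrawler -->
        <next_page_key>page=</next_page_key>
        <extract_links_elementId>content</extract_links_elementId>
    </task>
</crawler4j>

Calling new XmlLoader().load() then returns one WebCrawler per task entry, built either as a DefaultWebCrawler or, via reflection, from the class named in the <crawler> element.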