com.screenslicer.core.scrape.trainer.TrainerVisitorExtract.java Source code

Java tutorial

Introduction

Here is the source code for com.screenslicer.core.scrape.trainer.TrainerVisitorExtract.java

Source

/* 
 * ScreenSlicer (TM) -- automatic, zero-config web scraping (TM)
 * Copyright (C) 2013-2014 Machine Publishers, LLC
 * ops@machinepublishers.com | screenslicer.com | machinepublishers.com
 * 717 Martin Luther King Dr W Ste I, Cincinnati, Ohio 45220
 *
 * You can redistribute this program and/or modify it under the terms of the
 * GNU Affero General Public License version 3 as published by the Free
 * Software Foundation. Additional permissions or commercial licensing may be
 * available--see LICENSE file or contact Machine Publishers, LLC for details.
 * 
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License version 3
 * for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License
 * version 3 along with this program. If not, see <http://www.gnu.org/licenses/>.
 * 
 * For general details about how to investigate and report license violations,
 * please see: https://www.gnu.org/licenses/gpl-violation.html
 * and email the author: ops@machinepublishers.com
 * Keep in mind that paying customers have more rights than the AGPL alone offers.
 */
package com.screenslicer.core.scrape.trainer;

import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.io.FileUtils;
import org.jsoup.helper.DataUtil;
import org.jsoup.nodes.Element;

import com.screenslicer.core.scrape.Extract;
import com.screenslicer.core.scrape.type.ComparableNode;

public class TrainerVisitorExtract implements TrainerExtract.Visitor {
    private ArrayList<String> resultParents = new ArrayList<String>();
    private ArrayList<Element> elements = new ArrayList<Element>();
    private final Map<Element, ComparableNode[]> nodes = new HashMap<Element, ComparableNode[]>();
    private Object[] tmpItems;
    private int curItem;
    private static final ItemSort sorter = new ItemSort();
    private Map<ComparableNode, Integer> indices = new HashMap<ComparableNode, Integer>();

    @Override
    public void init() {
        final ArrayList<String> filenames = new ArrayList<String>();
        final List<String> bump = Arrays.asList(new String[] {});
        new File("./test/data-webpages/").listFiles(new FileFilter() {
            @Override
            public boolean accept(File file) {
                if (!file.getAbsolutePath().endsWith("-success") && !file.getAbsolutePath().endsWith("-successnode")
                        && !file.getAbsolutePath().endsWith("-result") && !file.getAbsolutePath().endsWith("-num")
                        && !file.getAbsolutePath().endsWith("-next")) {
                    try {
                        if (bump.contains(file.getName())) {
                            resultParents.add(0, FileUtils
                                    .readFileToString(new File(file.getAbsolutePath() + "-success", "utf-8")));
                            elements.add(0, DataUtil.load(file, "utf-8", "http://localhost").body());
                            filenames.add(0, file.getName());
                        } else {
                            resultParents.add(FileUtils
                                    .readFileToString(new File(file.getAbsolutePath() + "-success", "utf-8")));
                            elements.add(DataUtil.load(file, "utf-8", "http://localhost").body());
                            filenames.add(file.getName());
                        }
                    } catch (IOException e) {
                        throw new RuntimeException(e);
                    }
                }
                return false;
            }
        });
        for (String filename : filenames) {
            System.out.println(filename);
        }
        tmpItems = new Object[resultParents.size()];
    }

    @Override
    public void reload(boolean sort) {
        if (sort) {
            for (int i = curItem; i < tmpItems.length; i++) {
                tmpItems[i] = new Object[] { 0, elements.get(i), resultParents.get(i) };
            }
            Arrays.sort(tmpItems, sorter);
            elements.clear();
            resultParents.clear();
            for (int i = 0; i < tmpItems.length; i++) {
                elements.add((Element) ((Object[]) tmpItems[i])[1]);
                resultParents.add((String) ((Object[]) tmpItems[i])[2]);
            }
        }
        curItem = 0;
    }

    private static class ItemSort implements Comparator<Object> {
        @Override
        public int compare(Object o1, Object o2) {
            Integer lhs = (Integer) ((Object[]) o1)[0];
            Integer rhs = (Integer) ((Object[]) o2)[0];
            return rhs.compareTo(lhs);
        }
    }

    @Override
    public int visitNext(int page) {
        ComparableNode[] targetsArray = nodes.get(elements.get(curItem));
        if (targetsArray == null) {
            ComparableNode[] comparableNodes = Extract.trainInit(elements.get(curItem), page);
            List<ComparableNode> targets = new ArrayList<ComparableNode>();
            for (int i = 0; i < comparableNodes.length; i++) {
                if (comparableNodes[i].node().outerHtml().startsWith(resultParents.get(curItem))) {
                    targets.add(comparableNodes[i]);
                }
            }
            targetsArray = targets.toArray(new ComparableNode[0]);
            for (int i = 0; i < comparableNodes.length; i++) {
                for (int j = 0; j < targetsArray.length; j++) {
                    if (comparableNodes[i].equals(targetsArray[j])) {
                        indices.put(targetsArray[j], i);
                    }
                }
            }
            nodes.put(elements.get(curItem), targetsArray);
        }
        int score = Integer.MAX_VALUE;
        for (int i = 0; i < targetsArray.length && score != 0; i++) {
            int curScore = Extract.train(elements.get(curItem), page, targetsArray[i],
                    indices.get(targetsArray[i]));
            if (curScore < score) {
                score = curScore;
            }
        }
        tmpItems[curItem] = new Object[] { score, elements.get(curItem), resultParents.get(curItem) };
        ++curItem;
        return score;
    }

    @Override
    public int trainingDataSize() {
        return resultParents.size();
    }
}