im.delight.imagescraper.ImageScraper.java Source code

Introduction

Here is the source code for im.delight.imagescraper.ImageScraper.java
Source

package im.delight.imagescraper;

/**
 * Copyright 2013 www.delight.im <info@delight.im>
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.concurrent.PriorityBlockingQueue;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.HttpConnectionParams;
import org.apache.http.util.EntityUtils;

/** Extracts all image files from a given web page and returns as many of the largest files' URLs as requested */
public class ImageScraper extends Thread implements ImageCheckerCallback {

    /** The maximum file size in bytes that an image file may have (otherwise file size is returned as 0) */
    public static final int MAX_FILESIZE_BYTES = 786432;
    /** The minimum file size in bytes that an image file must have (otherwise file size is returned as 0) */
    public static final int MIN_FILESIZE_BYTES = 9050;
    /** Timeout for network read operations (in milliseconds) */
    public static final int NETWORK_READ_TIMEOUT_MILLIS = 5000;
    /** Timeout for network connection attempts (in milliseconds) */
    public static final int NETWORK_CONNECT_TIMEOUT_MILLIS = 3000;
    /** User-Agent string to send when accessing web pages */
    public static final String NETWORK_DEFAULT_USER_AGENT = "Android";
    /** MIME type that files must have to be eligible */
    public static final String MIME_TYPE_JPEG = "image/jpeg";
    /** RegEx that matches OpenGraph meta tags' property attributes */
    private static final String REGEX_OPEN_GRAPH_TAG_PROPERTY = "property(\\s*)=(\\s*)([\"']{1})(og:[a-zA-Z0-9]+)([\"']{1})";
    /** RegEx that matches OpenGraph meta tags' content attributes */
    private static final String REGEX_OPEN_GRAPH_TAG_CONTENT = "content(\\s*)=(\\s*)([\"']{1})([^\"']+)([\"']{1})";
    /** RegEx that matches the connection between OpenGraph property and content attributes */
    private static final String REGEX_OPEN_GRAPH_TAG_SEPARATOR = "(\\s+)";
    /** RegEx that finds complete OpenGraph meta tags (concatenation of the expressions above) */
    private static final String REGEX_OPEN_GRAPH_TAG = "(" + REGEX_OPEN_GRAPH_TAG_PROPERTY
            + REGEX_OPEN_GRAPH_TAG_SEPARATOR + REGEX_OPEN_GRAPH_TAG_CONTENT + "|" + REGEX_OPEN_GRAPH_TAG_CONTENT
            + REGEX_OPEN_GRAPH_TAG_SEPARATOR + REGEX_OPEN_GRAPH_TAG_PROPERTY + ")";
    /** RegEx that finds an optional base URL that may be set in an HTML document */
    private static final String REGEX_BASE_HREF = "base(\\s*)href=([\\\"']{1})([^\\\"']+)([\\\"']{1})";
    /** The callback that results will be delivered to from this ImageScraper instance */
    private ImageScraperCallback mCallback;
    /** The number of image files that have been requested by the calling Activity */
    private final int mImagesRequestedCount;
    /** The URL of the web page that this ImageScraper instance should work on */
    private final String mURL;
    /** The (optional) pre-defined title of the given web page */
    private final String mTitle;
    /** The URL to the root directory of the given web page */
    private final String mRootURL;
    /** The base URL of the given web page that is usually the directory containing the web page */
    private String mBaseURL;
    /** A pending result that may exist which has not been delivered to the calling Activity yet as there was no callback */
    private ImageScraperResult mPendingResult;
    /** The result that this ImageScraper builds and will ultimately deliver back to the calling Activity */
    private ImageScraperResult mOutput;
    /** Whether the core thread for the ImageChecker may time out or not */
    private boolean mAllowCoreThreadTimeOut;
    /** Custom User-Agent string that will be sent with all requests */
    private String mUserAgent;

    public ImageScraper(ImageScraperCallback callback, String url, int imagesRequestedCount) {
        this(callback, url, imagesRequestedCount, true);
    }

    public ImageScraper(ImageScraperCallback callback, String url, int imagesRequestedCount,
            boolean allowCoreThreadTimeOut) {
        this(callback, url, imagesRequestedCount, allowCoreThreadTimeOut, "");
    }

    public ImageScraper(ImageScraperCallback callback, String url, int imagesRequestedCount,
            boolean allowCoreThreadTimeOut, String title) {
        mCallback = callback;
        mURL = url;
        mTitle = title;
        mImagesRequestedCount = imagesRequestedCount;
        mAllowCoreThreadTimeOut = allowCoreThreadTimeOut;
        mRootURL = makeRootPath(url);
        mBaseURL = makeBasePath(url);
    }

    public void setUserAgent(String userAgent) {
        mUserAgent = userAgent;
    }

    public void setCallback(ImageScraperCallback callback) {
        mCallback = callback;
        if (mPendingResult != null) {
            mCallback.onFinished(mPendingResult); // notify the callback that the ImageScraper has finished and return its results
            mPendingResult = null;
            mCallback = null; // unset the callback as we do not need to receive any further information
        }
    }

    private static String makeBasePath(String url) {
        final int lastSlashPosition = url.lastIndexOf("/");
        if (lastSlashPosition >= 0) {
            return url.substring(0, lastSlashPosition) + "/";
        } else {
            return url;
        }
    }

    private static String makeRootPath(String url) {
        final String urlWithoutProtocol = url.replace("://", "");
        int firstSlashPosition = urlWithoutProtocol.indexOf("/");
        if (!url.equals(urlWithoutProtocol)) {
            firstSlashPosition += 3; // as we have removed 3 chars before and must consider this for the slash position
        }
        if (firstSlashPosition >= 0) {
            return url.substring(0, firstSlashPosition) + "/";
        } else {
            return url;
        }
    }

    @Override
    public void run() {
        if (mCallback != null) {
            mCallback.onStarted();
        }
        // FETCH HTML BEGIN
        BasicHttpParams httpParameters = new BasicHttpParams();
        HttpConnectionParams.setConnectionTimeout(httpParameters, 2500);
        HttpConnectionParams.setSoTimeout(httpParameters, 4000);
        DefaultHttpClient client = new DefaultHttpClient(httpParameters);
        String responseStr;
        try {
            HttpGet httpGet = new HttpGet(mURL);
            httpGet.setHeader("User-Agent", getUserAgent());
            final HttpResponse responseData = client.execute(httpGet);
            responseStr = EntityUtils.toString(responseData.getEntity());
        } catch (Exception e) {
            if (mCallback != null) {
                mCallback.onFinished(new ImageScraperResult(ImageScraperResult.ERROR_IO)); // notify the callback that the ImageScraper has finished and return its results
                mCallback = null; // unset the callback as we do not need to receive any further information
            }
            return;
        }
        if (responseStr == null || responseStr.equals("")) {
            if (mCallback != null) {
                mCallback.onFinished(new ImageScraperResult(ImageScraperResult.ERROR_EMPTY)); // notify the callback that the ImageScraper has finished and return its results
                mCallback = null; // unset the callback as we do not need to receive any further information
            }
            return;
        }
        mOutput = new ImageScraperResult(mURL, mTitle, mImagesRequestedCount);
        // FETCH HTML END

        // SEE IF HTML DOCUMENT HAS ANY BASE PATH SET BEGIN
        final Pattern basePathRegex = Pattern.compile(REGEX_BASE_HREF);
        final Matcher basePath = basePathRegex.matcher(responseStr);
        if (basePath.find()) {
            mBaseURL = basePath.group(3);
        }
        // SEE IF HTML DOCUMENT HAS ANY BASE PATH SET END

        // TRY TO FIND OPEN GRAPH META TAGS BEGIN
        final Pattern openGraphTagRegex = Pattern.compile(REGEX_OPEN_GRAPH_TAG);
        final Matcher openGraphTag = openGraphTagRegex.matcher(responseStr);
        String[] propertyName = new String[2];
        String[] propertyContent = new String[2];
        while (openGraphTag.find()) {
            propertyName[0] = openGraphTag.group(5);
            propertyContent[0] = openGraphTag.group(11);
            propertyName[1] = openGraphTag.group(22);
            propertyContent[1] = openGraphTag.group(16);
            for (int i = 0; i < 2; i++) {
                if (propertyName[i] != null && !propertyName[i].equals("")) {
                    if (propertyName[i].equals("og:url")) {
                        mOutput.setURL(propertyContent[i]);
                    } else if (propertyName[i].equals("og:title")) {
                        mOutput.setTitle(propertyContent[i]);
                    } else if (propertyName[i].equals("og:image")) {
                        mOutput.addImageURL(propertyContent[i], true);
                    }
                }
            }
        }
        // TRY TO FIND OPEN GRAPH META TAGS END

        // TRY TO PARSE FULL HTML DOCUMENT BEGIN
        final ImageURLFinder imageURLFinder = new ImageURLFinder(mRootURL, mBaseURL);
        final Iterable<String> imageURLs = imageURLFinder.find(responseStr);
        // TRY TO PARSE FULL HTML DOCUMENT END

        // GET THE LARGEST IMAGE FILES AND WAIT FOR CALLBACK BEGIN
        new ImageChecker(this, getUserAgent(), mAllowCoreThreadTimeOut).start(imageURLs);
        // GET THE LARGEST IMAGE FILES FROM THE LIST END
    }

    protected String getUserAgent() {
        return mUserAgent == null ? NETWORK_DEFAULT_USER_AGENT : mUserAgent;
    }

    @Override
    public void onImageCheckerFinished(PriorityBlockingQueue<ImageURL> imageURLs) {
        // COLLECT THE LARGEST IMAGE FILES BEGIN
        boolean imageSlotsAvailable = true;
        while (imageSlotsAvailable) { // while images available in queue
            ImageURL imageURL = imageURLs.poll(); // get the next image
            if (imageURL != null) { // if still images in queue
                if (imageURL.getFileSize() > 0) { // if image could be accessed
                    imageSlotsAvailable = mOutput.addImageURL(imageURL.getURL()); // add it to result list
                }
            } else { // if no more images in queue
                imageSlotsAvailable = false; // stop iterating
            }
        }
        // COLLECT THE LARGEST IMAGE FILES END

        if (mCallback != null) {
            mCallback.onFinished(mOutput); // notify the callback that the ImageScraper has finished and return its results
            mCallback = null; // unset the callback as we do not need to receive any further information
        } else {
            mPendingResult = mOutput;
        }
    }

}