Android Open Source - AndroidImageScraper Image Scraper






From Project

Back to project page AndroidImageScraper.

License

The source code is released under:

Apache License

If you believe the Android project AndroidImageScraper listed on this page is inappropriate (for example, because it contains malicious code or tools, or violates copyright), please email info at java2s dot com.

Java Source Code

package im.delight.imagescraper;
/*w w w. j a  va  2 s .  c  om*/
/**
 * Copyright 2013 www.delight.im <info@delight.im>
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.concurrent.PriorityBlockingQueue;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.HttpConnectionParams;
import org.apache.http.util.EntityUtils;

/**
 * Extracts all image files from a given web page and returns as many of the largest files' URLs as requested
 * <p>
 * The scraper runs on its own thread: it downloads the page at the given URL, reads an optional base URL and
 * any OpenGraph meta tags from the HTML, collects the image URLs found in the document and hands them over to
 * an {@link ImageChecker}. Once the checker reports back, the final {@link ImageScraperResult} is delivered to
 * the {@link ImageScraperCallback}.
 * <p>
 * If no callback is attached at the time a result (successful or not) becomes available, the result is kept
 * as pending and delivered as soon as a callback is attached via {@link #setCallback(ImageScraperCallback)}.
 */
public class ImageScraper extends Thread implements ImageCheckerCallback {

  /** The maximum file size in bytes that an image file may have (otherwise file size is returned as 0) */
  public static final int MAX_FILESIZE_BYTES = 786432;
  /** The minimum file size in bytes that an image file must have (otherwise file size is returned as 0) */
  public static final int MIN_FILESIZE_BYTES = 9050;
  /** Timeout for network read operations (in milliseconds) */
  public static final int NETWORK_READ_TIMEOUT_MILLIS = 5000;
  /** Timeout for network connection attempts (in milliseconds) */
  public static final int NETWORK_CONNECT_TIMEOUT_MILLIS = 3000;
  /** User-Agent string to send when accessing web pages */
  public static final String NETWORK_DEFAULT_USER_AGENT = "Android";
  /** MIME type that files must have to be eligible */
  public static final String MIME_TYPE_JPEG = "image/jpeg";
  /** RegEx that matches OpenGraph meta tags' property attributes */
  private static final String REGEX_OPEN_GRAPH_TAG_PROPERTY = "property(\\s*)=(\\s*)([\"']{1})(og:[a-zA-Z0-9]+)([\"']{1})";
  /** RegEx that matches OpenGraph meta tags' content attributes */
  private static final String REGEX_OPEN_GRAPH_TAG_CONTENT = "content(\\s*)=(\\s*)([\"']{1})([^\"']+)([\"']{1})";
  /** RegEx that matches the connection between OpenGraph property and content attributes */
  private static final String REGEX_OPEN_GRAPH_TAG_SEPARATOR = "(\\s+)";
  /** RegEx that finds complete OpenGraph meta tags (concatenation of the expressions above) */
  private static final String REGEX_OPEN_GRAPH_TAG = "("+REGEX_OPEN_GRAPH_TAG_PROPERTY+REGEX_OPEN_GRAPH_TAG_SEPARATOR+REGEX_OPEN_GRAPH_TAG_CONTENT+"|"+REGEX_OPEN_GRAPH_TAG_CONTENT+REGEX_OPEN_GRAPH_TAG_SEPARATOR+REGEX_OPEN_GRAPH_TAG_PROPERTY+")";
  /** RegEx that finds an optional base URL that may be set in an HTML document */
  private static final String REGEX_BASE_HREF = "base(\\s*)href=([\\\"']{1})([^\\\"']+)([\\\"']{1})";
  /** Pre-compiled version of {@link #REGEX_OPEN_GRAPH_TAG} (compiled once instead of on every run) */
  private static final Pattern PATTERN_OPEN_GRAPH_TAG = Pattern.compile(REGEX_OPEN_GRAPH_TAG);
  /** Pre-compiled version of {@link #REGEX_BASE_HREF} (compiled once instead of on every run) */
  private static final Pattern PATTERN_BASE_HREF = Pattern.compile(REGEX_BASE_HREF);
  /** The character sequence that separates the protocol from the rest of a URL */
  private static final String PROTOCOL_SEPARATOR = "://";
  /** Capturing group of the base URL in {@link #REGEX_BASE_HREF} */
  private static final int GROUP_BASE_HREF_URL = 3;
  /** Capturing groups for the "property first, content second" variant of {@link #REGEX_OPEN_GRAPH_TAG} */
  private static final int GROUP_PROPERTY_FIRST_NAME = 5;
  private static final int GROUP_PROPERTY_FIRST_CONTENT = 11;
  /** Capturing groups for the "content first, property second" variant of {@link #REGEX_OPEN_GRAPH_TAG} */
  private static final int GROUP_CONTENT_FIRST_NAME = 22;
  private static final int GROUP_CONTENT_FIRST_CONTENT = 16;
  /** Guards {@link #mCallback} and {@link #mPendingResult}, which are accessed from several threads */
  private final Object mCallbackLock = new Object();
  /** The callback that results will be delivered to from this ImageScraper instance */
  private ImageScraperCallback mCallback;
  /** The number of image files that have been requested by the calling Activity */
  private final int mImagesRequestedCount;
  /** The URL of the web page that this ImageScraper instance should work on */
  private final String mURL;
  /** The (optional) pre-defined title of the given web page */
  private final String mTitle;
  /** The URL to the root directory of the given web page */
  private final String mRootURL;
  /** The base URL of the given web page that is usually the directory containing the web page */
  private String mBaseURL;
  /** A pending result that may exist which has not been delivered to the calling Activity yet as there was no callback */
  private ImageScraperResult mPendingResult;
  /** The result that this ImageScraper builds and will ultimately deliver back to the calling Activity */
  private ImageScraperResult mOutput;
  /** Whether the core thread for the ImageChecker may time out or not */
  private final boolean mAllowCoreThreadTimeOut;
  /** Custom User-Agent string that will be sent with all requests */
  private String mUserAgent;

  /**
   * Creates a new scraper that allows the ImageChecker's core thread to time out and uses an empty title
   *
   * @param callback the callback to deliver results to (may be attached later via {@link #setCallback(ImageScraperCallback)})
   * @param url the URL of the web page to scrape
   * @param imagesRequestedCount the number of image files that are requested
   */
  public ImageScraper(ImageScraperCallback callback, String url, int imagesRequestedCount) {
    this(callback, url, imagesRequestedCount, true);
  }

  /**
   * Creates a new scraper that uses an empty title
   *
   * @param callback the callback to deliver results to (may be attached later via {@link #setCallback(ImageScraperCallback)})
   * @param url the URL of the web page to scrape
   * @param imagesRequestedCount the number of image files that are requested
   * @param allowCoreThreadTimeOut whether the core thread for the ImageChecker may time out or not
   */
  public ImageScraper(ImageScraperCallback callback, String url, int imagesRequestedCount, boolean allowCoreThreadTimeOut) {
    this(callback, url, imagesRequestedCount, allowCoreThreadTimeOut, "");
  }

  /**
   * Creates a new scraper
   *
   * @param callback the callback to deliver results to (may be attached later via {@link #setCallback(ImageScraperCallback)})
   * @param url the URL of the web page to scrape
   * @param imagesRequestedCount the number of image files that are requested
   * @param allowCoreThreadTimeOut whether the core thread for the ImageChecker may time out or not
   * @param title the pre-defined title of the web page that the result is initialized with
   */
  public ImageScraper(ImageScraperCallback callback, String url, int imagesRequestedCount, boolean allowCoreThreadTimeOut, String title) {
    mCallback = callback;
    mURL = url;
    mTitle = title;
    mImagesRequestedCount = imagesRequestedCount;
    mAllowCoreThreadTimeOut = allowCoreThreadTimeOut;
    mRootURL = makeRootPath(url);
    mBaseURL = makeBasePath(url);
  }

  /**
   * Sets a custom User-Agent string that is sent instead of {@link #NETWORK_DEFAULT_USER_AGENT}
   *
   * @param userAgent the User-Agent string to send or {@code null} to use the default
   */
  public void setUserAgent(String userAgent) {
    mUserAgent = userAgent;
  }

  /**
   * Attaches (or detaches) the callback that results are delivered to
   * <p>
   * If a result is already pending and the given callback is not {@code null}, the pending result is delivered
   * immediately and the callback is released again. Passing {@code null} only detaches the current callback
   * and leaves any pending result untouched so that it can be picked up by a later callback.
   *
   * @param callback the new callback or {@code null} to detach the current one
   */
  public void setCallback(ImageScraperCallback callback) {
    final ImageScraperResult pendingResult;
    synchronized (mCallbackLock) {
      if (callback == null || mPendingResult == null) {
        mCallback = callback;
        return;
      }
      pendingResult = mPendingResult;
      mPendingResult = null;
      mCallback = null; // unset the callback as we do not need to receive any further information
    }
    // invoked outside of the lock so that foreign code never runs while the lock is held
    callback.onFinished(pendingResult); // notify the callback that the ImageScraper has finished and return its results
  }

  /**
   * Hands the given result to the attached callback or keeps it as pending if there is no callback right now
   * <p>
   * After a successful delivery the callback is released as no further information will be sent to it.
   *
   * @param result the result to deliver
   */
  private void deliverResult(ImageScraperResult result) {
    final ImageScraperCallback callback;
    synchronized (mCallbackLock) {
      callback = mCallback;
      if (callback == null) {
        mPendingResult = result;
        return;
      }
      mCallback = null; // unset the callback as we do not need to receive any further information
    }
    // invoked outside of the lock so that foreign code never runs while the lock is held
    callback.onFinished(result); // notify the callback that the ImageScraper has finished and return its results
  }

  /**
   * Finds the position where the host part of the given URL starts
   *
   * @param url the URL to examine
   * @return the index right after the protocol separator or {@code 0} if the URL does not start with a protocol
   */
  private static int findHostStart(String url) {
    final int separatorPosition = url.indexOf(PROTOCOL_SEPARATOR);
    // only accept the separator as a protocol delimiter if its slashes are the very first slashes of the URL
    // (otherwise it belongs to the path or query, e.g. a redirect target passed as a parameter)
    if (separatorPosition >= 0 && url.indexOf('/') == separatorPosition + 1) {
      return separatorPosition + PROTOCOL_SEPARATOR.length();
    }
    return 0;
  }

  /**
   * Returns the given URL cut off after its last path slash (i.e. usually the directory containing the page)
   *
   * @param url the URL of the web page
   * @return the base path with a trailing slash or the unchanged URL if it does not contain any path slash
   */
  private static String makeBasePath(String url) {
    final int lastSlashPosition = url.lastIndexOf('/');
    // slashes that are part of the protocol separator do not count as path slashes
    if (lastSlashPosition >= findHostStart(url)) {
      return url.substring(0, lastSlashPosition)+"/";
    }
    else {
      return url;
    }
  }

  /**
   * Returns the given URL cut off after the first slash that follows the host (i.e. the root directory)
   *
   * @param url the URL of the web page
   * @return the root path with a trailing slash or the unchanged URL if there is no slash after the host
   */
  private static String makeRootPath(String url) {
    final int firstSlashPosition = url.indexOf('/', findHostStart(url));
    if (firstSlashPosition >= 0) {
      return url.substring(0, firstSlashPosition)+"/";
    }
    else {
      return url;
    }
  }

  /**
   * Downloads the web page, extracts base URL, OpenGraph data and image URLs and starts the ImageChecker
   * <p>
   * Failures to download the page are reported as {@code ImageScraperResult.ERROR_IO}, empty responses as
   * {@code ImageScraperResult.ERROR_EMPTY}. In the success case the final result is delivered later from
   * {@link #onImageCheckerFinished(PriorityBlockingQueue)}.
   */
  @Override
  public void run() {
    final ImageScraperCallback startCallback;
    synchronized (mCallbackLock) {
      startCallback = mCallback;
    }
    if (startCallback != null) {
      startCallback.onStarted();
    }

    // FETCH HTML BEGIN
    // NOTE(review): these timeouts differ from NETWORK_CONNECT_TIMEOUT_MILLIS and NETWORK_READ_TIMEOUT_MILLIS;
    // the values are kept as they were to preserve the existing timing - confirm whether this is intended
    final BasicHttpParams httpParameters = new BasicHttpParams();
    HttpConnectionParams.setConnectionTimeout(httpParameters, 2500);
    HttpConnectionParams.setSoTimeout(httpParameters, 4000);
    final DefaultHttpClient client = new DefaultHttpClient(httpParameters);
    final String responseStr;
    try {
      final HttpGet httpGet = new HttpGet(mURL);
      httpGet.setHeader("User-Agent", getUserAgent());
      final HttpResponse responseData = client.execute(httpGet);
      responseStr = EntityUtils.toString(responseData.getEntity());
    }
    catch (Exception e) {
      // any failure (malformed URL, network error, missing entity) is reported as an I/O error
      deliverResult(new ImageScraperResult(ImageScraperResult.ERROR_IO));
      return;
    }
    finally {
      // release the connections held by this one-off client
      client.getConnectionManager().shutdown();
    }
    if (responseStr == null || responseStr.equals("")) {
      deliverResult(new ImageScraperResult(ImageScraperResult.ERROR_EMPTY));
      return;
    }
    mOutput = new ImageScraperResult(mURL, mTitle, mImagesRequestedCount);
    // FETCH HTML END

    // SEE IF HTML DOCUMENT HAS ANY BASE PATH SET BEGIN
    final Matcher basePath = PATTERN_BASE_HREF.matcher(responseStr);
    if (basePath.find()) {
      mBaseURL = basePath.group(GROUP_BASE_HREF_URL);
    }
    // SEE IF HTML DOCUMENT HAS ANY BASE PATH SET END

    // TRY TO FIND OPEN GRAPH META TAGS BEGIN
    applyOpenGraphTags(responseStr);
    // TRY TO FIND OPEN GRAPH META TAGS END

    // TRY TO PARSE FULL HTML DOCUMENT BEGIN
    final ImageURLFinder imageURLFinder = new ImageURLFinder(mRootURL, mBaseURL);
    final Iterable<String> imageURLs = imageURLFinder.find(responseStr);
    // TRY TO PARSE FULL HTML DOCUMENT END

    // GET THE LARGEST IMAGE FILES AND WAIT FOR CALLBACK BEGIN
    new ImageChecker(this, getUserAgent(), mAllowCoreThreadTimeOut).start(imageURLs);
    // GET THE LARGEST IMAGE FILES AND WAIT FOR CALLBACK END
  }

  /**
   * Searches the given HTML for OpenGraph meta tags and applies all supported ones to the current result
   *
   * @param html the HTML source of the web page
   */
  private void applyOpenGraphTags(String html) {
    final Matcher openGraphTag = PATTERN_OPEN_GRAPH_TAG.matcher(html);
    while (openGraphTag.find()) {
      // a tag matches exactly one of the two attribute orders, the groups of the other one are null
      applyOpenGraphProperty(openGraphTag.group(GROUP_PROPERTY_FIRST_NAME), openGraphTag.group(GROUP_PROPERTY_FIRST_CONTENT));
      applyOpenGraphProperty(openGraphTag.group(GROUP_CONTENT_FIRST_NAME), openGraphTag.group(GROUP_CONTENT_FIRST_CONTENT));
    }
  }

  /**
   * Applies a single OpenGraph property ({@code og:url}, {@code og:title} or {@code og:image}) to the current result
   *
   * @param name the name of the property or {@code null} if there is none
   * @param content the value of the property
   */
  private void applyOpenGraphProperty(String name, String content) {
    if (name == null || name.equals("")) {
      return;
    }
    if (name.equals("og:url")) {
      mOutput.setURL(content);
    }
    else if (name.equals("og:title")) {
      mOutput.setTitle(content);
    }
    else if (name.equals("og:image")) {
      mOutput.addImageURL(content, true);
    }
  }

  /**
   * Returns the User-Agent string that is sent with all requests
   *
   * @return the custom User-Agent if one has been set or {@link #NETWORK_DEFAULT_USER_AGENT} otherwise
   */
  protected String getUserAgent() {
    return mUserAgent == null ? NETWORK_DEFAULT_USER_AGENT : mUserAgent;
  }

  /**
   * Collects the checked images from the given queue and delivers the final result
   * <p>
   * Images are taken from the queue in its priority order until either the queue is empty or the result
   * reports that it does not accept any further images. Images with a file size of {@code 0} are skipped.
   *
   * @param imageURLs the queue of images that have been checked by the ImageChecker
   */
  @Override
  public void onImageCheckerFinished(PriorityBlockingQueue<ImageURL> imageURLs) {
    // COLLECT THE LARGEST IMAGE FILES BEGIN
    boolean imageSlotsAvailable = true;
    ImageURL imageURL;
    while (imageSlotsAvailable && (imageURL = imageURLs.poll()) != null) { // while slots are free and images are left in queue
      if (imageURL.getFileSize() > 0) { // if image could be accessed
        imageSlotsAvailable = mOutput.addImageURL(imageURL.getURL()); // add it to result list
      }
    }
    // COLLECT THE LARGEST IMAGE FILES END

    deliverResult(mOutput);
  }

}




Java Source Code List

im.delight.imagescraper.ImageCheckerCallback.java
im.delight.imagescraper.ImageCheckerTask.java
im.delight.imagescraper.ImageChecker.java
im.delight.imagescraper.ImageScraperCallback.java
im.delight.imagescraper.ImageScraperResult.java
im.delight.imagescraper.ImageScraper.java
im.delight.imagescraper.ImageURLFinder.java
im.delight.imagescraper.ImageURL.java