org.asqatasun.crawler.util.CrawlUtils.java Source code

Java tutorial

Introduction

Here is the source code for org.asqatasun.crawler.util.CrawlUtils.java

Source

/*
 * Asqatasun - Automated webpage assessment
 * Copyright (C) 2008-2015  Asqatasun.org
 *
 * This file is part of Asqatasun.
 *
 * Asqatasun is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * Contact us by mail: asqatasun AT asqatasun DOT org
 */
package org.asqatasun.crawler.util;

import java.awt.image.BufferedImage;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
import java.util.NoSuchElementException;
import javax.imageio.ImageIO;
import javax.imageio.ImageWriter;
import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;
import org.archive.io.RecordingInputStream;
import org.mozilla.universalchardet.UniversalDetector;

/**
 *
 * @author jkowalczyk
 */
public final class CrawlUtils {

    private static final Logger LOGGER = Logger.getLogger(CrawlUtils.class);
    private static final String DEFAULT_CHARSET = "UTF-8";
    private static final int BYTE_BUFFER_SIZE = 1000;
    private static final String DEFAULT_IMG_EXTENSION = "jpg";

    private CrawlUtils() {
    };

    /**
     * This method converts the sourceCode into UTF-8 charset to ensure the
     * charset compatibility with the source and the charset used for the
     * persistence.
     * @param recis
     * @param charset
     * @return
     * @throws IOException
     */
    public static String convertSourceCodeIntoUtf8(RecordingInputStream recis, String charset) throws IOException {
        ByteArrayOutputStream baos = null;
        try {
            if (!charset.equalsIgnoreCase(DEFAULT_CHARSET)) {
                Charset utf8charset = Charset.forName(DEFAULT_CHARSET);
                Charset incomingCharset = Charset.forName(charset);
                baos = new ByteArrayOutputStream();
                recis.getReplayInputStream().readContentTo(baos);
                ByteBuffer inputBuffer = ByteBuffer.wrap(baos.toByteArray());
                CharBuffer data = incomingCharset.decode(inputBuffer);
                ByteBuffer outputBuffer = utf8charset.encode(data);
                byte[] outputData = outputBuffer.array();
                String source = new String(outputData);
                return source;
            } else {
                baos = new ByteArrayOutputStream();
                recis.getReplayInputStream().readContentTo(baos);
                String source = baos.toString(charset);
                return source;
            }
        } finally {
            IOUtils.closeQuietly(recis);
            if (baos != null) {
                IOUtils.closeQuietly(baos);
            }
        }
    }

    /**
     * This methods tests if a charset is valid regarding the charset nio API.
     * @param charset
     * @return
     */
    public static boolean isValidCharset(String charset) {
        try {
            Charset.forName(charset);
        } catch (UnsupportedCharsetException e) {
            return false;
        }
        return true;
    }

    /**
     * This method extracts the charset from the html source code.
     * If the charset is not specified, it is set to UTF-8 by default
     * @param is
     * @return
     */
    public static String extractCharset(InputStream is) throws java.io.IOException {
        byte[] buf = new byte[4096];
        UniversalDetector detector = new UniversalDetector(null);
        int nread;
        while ((nread = is.read(buf)) > 0 && !detector.isDone()) {
            detector.handleData(buf, 0, nread);
        }
        detector.dataEnd();

        String encoding = detector.getDetectedCharset();
        if (encoding != null) {
            LOGGER.debug("Detected encoding = " + encoding);
        } else {
            LOGGER.debug("No encoding detected.");
        }

        detector.reset();
        if (encoding != null && CrawlUtils.isValidCharset(encoding)) {
            return encoding;
        } else {
            return DEFAULT_CHARSET;
        }
    }

    /**
     * Get an image extension from the image url
     * @param imageUrl
     * @return
     */
    public static String getImageExtension(String imageUrl) {
        String ext = imageUrl.substring(imageUrl.lastIndexOf('.') + 1);
        try {
            java.util.Iterator<ImageWriter> it = ImageIO.getImageWritersBySuffix(ext);
            if (it.next() != null) {
                return ext;
            }
        } catch (NoSuchElementException ex) {
            return DEFAULT_IMG_EXTENSION;
        }
        return DEFAULT_IMG_EXTENSION;
    }

    /**
     * Get the raw content of an image
     * @param is
     * @param imgExtension
     * @return
     */
    public static byte[] getImageContent(InputStream is, String imgExtension) {
        // O P E N
        ByteArrayOutputStream baos = new ByteArrayOutputStream(BYTE_BUFFER_SIZE);
        byte[] resultImageAsRawBytes = null;
        try {
            BufferedImage image = ImageIO.read(is);
            // W R I T E
            ImageIO.write(image, imgExtension, baos);
            // C L O S E
            baos.flush();
            resultImageAsRawBytes = baos.toByteArray();
            baos.close();
        } catch (IOException ex) {
            LOGGER.warn(ex.getMessage());
        }
        return resultImageAsRawBytes;
    }

}