com.music.tools.SongDBDownloader.java Source code

Java tutorial

Introduction

Here is the source code for com.music.tools.SongDBDownloader.java

Source

/*
 * Computoser is a music-composition algorithm and a website to present the results
 * Copyright (C) 2012-2014  Bozhidar Bozhanov
 *
 * Computoser is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * Computoser is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with Computoser.  If not, see <http://www.gnu.org/licenses/>.
 */

package com.music.tools;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.HttpContext;

import com.google.common.collect.Sets;
import com.google.common.io.CharStreams;
import com.google.common.io.Files;

/**
 * Command-line tool that downloads song data from hooktheory.com.
 *
 * <p>It fetches a browse page, collects the links found on it, extracts the request parameters
 * embedded in each linked page, and then posts those parameters to the site's
 * {@code songs/getXML} endpoint. Every successful (HTTP 200) response body is stored as a
 * numbered {@code .xml} file in {@link #OUTPUT_DIR}.
 */
public class SongDBDownloader {
    /** Charset used both for decoding HTTP response bodies and for the files written to disk. */
    private static final Charset UTF_8 = Charset.forName("utf-8");

    /** Directory that receives one numbered XML file per successful download. */
    private static final File OUTPUT_DIR = new File("c:/tmp/musicdb/");

    /** Matches the {@code href} attribute values that are collected from the browse page. */
    private static final Pattern SONG_URL_PATTERN = Pattern.compile("href=\"([\\w/\\-:\\.]+)\" ");

    /**
     * Runs the whole download: warm-up request, link discovery, parameter extraction and one
     * POST per extracted parameter set.
     *
     * @param args ignored
     * @throws Exception if the output directory cannot be created, or if the link discovery or
     *         parameter extraction fails; failures of individual XML downloads are printed and
     *         skipped instead
     */
    public static void main(String[] args) throws Exception {
        // Fail fast: without the directory every single file write below would fail.
        if (!OUTPUT_DIR.isDirectory() && !OUTPUT_DIR.mkdirs()) {
            throw new IOException("Cannot create output directory " + OUTPUT_DIR);
        }

        HttpClient client = new DefaultHttpClient();

        //        HttpHost proxy = new HttpHost("localhost", 8888);
        //        client.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);

        HttpContext ctx = new BasicHttpContext();

        // The response of this first request is discarded.
        // NOTE(review): presumably it only exists to establish session state (cookies) before
        // the real requests are made - confirm.
        HttpUriRequest req = new HttpGet(
                "http://www.hooktheory.com/analysis/view/the-beatles/i-want-to-hold-your-hand");
        try {
            client.execute(req, ctx);
        } finally {
            // Always release the connection, even when the request itself failed.
            req.abort();
        }

        List<String> urls = getSongUrls(
                "http://www.hooktheory.com/analysis/browseSearch?sQuery=&sOrderBy=views&nResultsPerPage=525&nPage=1",
                client, ctx);
        List<List<? extends NameValuePair>> paramsList = new ArrayList<>(urls.size());
        for (String songUrl : urls) {
            paramsList.addAll(getSongParams(songUrl, client, ctx));
        }

        // Index used for the output file name; it only advances when a response was received,
        // so a request that throws does not consume a number.
        int i = 0;
        for (List<? extends NameValuePair> params : paramsList) {
            HttpPost request = createSongXmlRequest(params);
            try {
                HttpResponse response = client.execute(request, ctx);
                if (response.getStatusLine().getStatusCode() == 200) {
                    String xml = readBody(response.getEntity());
                    Files.write(xml, new File(OUTPUT_DIR, i + ".xml"), UTF_8);
                } else {
                    System.out.println(response.getStatusLine());
                    System.out.println(params);
                }
                i++;
            } catch (Exception ex) {
                // Best effort: report the failing parameter set and carry on with the next one.
                System.out.println(params);
                ex.printStackTrace();
            } finally {
                // Release the connection on every path so the next request can reuse it.
                request.abort();
            }
        }
    }

    /**
     * Builds the POST request that asks the site for the XML of one song section.
     *
     * @param params form parameters to send as the URL-encoded request body
     * @return the fully configured request, not yet executed
     * @throws IOException if the referer or the form entity cannot be encoded
     */
    private static HttpPost createSongXmlRequest(List<? extends NameValuePair> params) throws IOException {
        HttpPost request = new HttpPost("http://www.hooktheory.com/songs/getXML");

        request.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20100101 Firefox/15.0.1");
        request.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        // NOTE(review): DefaultHttpClient does not appear to decompress responses transparently;
        // if the server honours this header the stored body may be compressed bytes - confirm.
        request.setHeader("Accept-Encoding", "gzip, deflate");
        request.setHeader("Accept-Language", "en,en-us;q=0.7,bg;q=0.3");
        request.setHeader("Content-Type", "application/x-www-form-urlencoded");
        request.setHeader("Origin", "http://www.hooktheory.com");
        request.setHeader("Referer",
                URLEncoder.encode("http://www.hooktheory.com/swf/DNALive Version 1.0.131.swf", "utf-8"));

        request.setEntity(new UrlEncodedFormEntity(params));
        return request;
    }

    /**
     * Downloads a page and extracts the request parameter sets embedded in it.
     *
     * <p>Each known field is searched for as {@code 'field':'value'}. The n-th occurrence of
     * every field goes into the n-th parameter list, so a page with several embedded clients
     * yields several lists. The {@code sCSRFToken} field is sent under the name
     * {@code YII_CSRF_TOKEN}.
     *
     * @param songUrl address of the page to inspect
     * @param client HTTP client used for the download
     * @param ctx HTTP context shared between the requests
     * @return one parameter list per occurrence index found on the page, possibly empty
     * @throws IOException if the page cannot be downloaded
     */
    private static List<List<NameValuePair>> getSongParams(String songUrl, HttpClient client, HttpContext ctx)
            throws IOException {
        String html = getResponseAsString(songUrl, client, ctx);
        List<List<NameValuePair>> result = new ArrayList<>();
        Set<String> fields = Sets.newHashSet("username", "artist", "song", "section", "revision", "HTID",
                "sCSRFToken");
        for (String field : fields) {
            // NOTE(review): (.+) is greedy, so several quoted values on one line would be
            // captured as a single value - verify against the actual page markup.
            Pattern pattern = Pattern.compile("'" + field + "':'(.+)'");
            Matcher m = pattern.matcher(html);
            int i = 0;
            while (m.find()) {
                // supporting multiple instances of the flash client per page.
                List<NameValuePair> httpParams;
                if (i >= result.size()) {
                    httpParams = new ArrayList<NameValuePair>();
                    result.add(httpParams);
                } else {
                    httpParams = result.get(i);
                }
                String name = field.equals("sCSRFToken") ? "YII_CSRF_TOKEN" : field;
                httpParams.add(new BasicNameValuePair(name, m.group(1)));
                i++;
            }
        }

        return result;
    }

    /**
     * Downloads a listing page and returns every {@code href} value matching
     * {@link #SONG_URL_PATTERN}, in document order.
     *
     * @param list address of the listing page
     * @param client HTTP client used for the download
     * @param ctx HTTP context shared between the requests
     * @return the collected link targets, possibly empty
     * @throws IOException if the page cannot be downloaded
     */
    private static List<String> getSongUrls(String list, HttpClient client, HttpContext ctx) throws IOException {
        String html = getResponseAsString(list, client, ctx);
        // Right, NEVER use regex for html parsing. Only this time :)
        Matcher m = SONG_URL_PATTERN.matcher(html);
        List<String> result = new ArrayList<>();
        while (m.find()) {
            result.add(m.group(1));
        }

        return result;
    }

    /**
     * Performs a GET request and returns the response body as text.
     *
     * @param urlString address to fetch
     * @param client HTTP client used for the download
     * @param ctx HTTP context shared between the requests
     * @return the response body decoded as UTF-8
     * @throws IOException if the request fails or the response carries no body
     */
    private static String getResponseAsString(String urlString, HttpClient client, HttpContext ctx)
            throws IOException {
        HttpUriRequest req = new HttpGet(urlString);
        try {
            return readBody(client.execute(req, ctx).getEntity());
        } finally {
            // Release the connection whether or not reading the body succeeded.
            req.abort();
        }
    }

    /**
     * Reads an entity's content completely and closes the underlying stream.
     *
     * <p>The content is decoded as UTF-8 instead of the platform default charset, so the result
     * does not depend on the machine the tool runs on.
     * NOTE(review): assumes the site serves UTF-8 - confirm against the response headers.
     *
     * @param entity response entity to read, may be {@code null}
     * @return the decoded content
     * @throws IOException if {@code entity} is {@code null} or reading fails
     */
    private static String readBody(HttpEntity entity) throws IOException {
        if (entity == null) {
            throw new IOException("Response has no body");
        }
        try (InputStream is = entity.getContent();
                InputStreamReader reader = new InputStreamReader(is, UTF_8)) {
            return CharStreams.toString(reader);
        }
    }
}