org.opensextant.xtext.collectors.sharepoint.SharepointClient.java Source code

Introduction

Here is the source code for org.opensextant.xtext.collectors.sharepoint.SharepointClient.java
Source

/**
 *
 * Copyright 2013-2014 OpenSextant.org
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package org.opensextant.xtext.collectors.sharepoint;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.auth.AuthScope;
import org.apache.http.auth.NTCredentials;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.impl.client.HttpClientBuilder;
import org.opensextant.ConfigException;
import org.opensextant.xtext.collectors.web.WebClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Simple client that pulls down HTML from sharepoint, acquire files and crawl sub-folders.
 * Similar to WebClient, but not really...
 * This is not a general Sharepoint interface library nor a general Sharepoint crawling tool.
 * This *is* a barebones demonstration of how to interact with a Sharepoint site and capture meaningful content in
 * an SP site and its sub-folders.
 * 
 */
public class SharepointClient extends WebClient {

    private Logger log = LoggerFactory.getLogger(getClass());

    /** For testing: 
     * @throws MalformedURLException 
     * @throws ConfigException 
     */
    public SharepointClient(String siteUrl, String archive, String u, String p, String dom)
            throws MalformedURLException, ConfigException {
        super(siteUrl, archive);
        user = u;
        passwd = p;
        domain = dom;
    }

    private String user = null;
    private String passwd = null;
    private String domain = null;

    public void setUser(String u) {
        user = u;
    }

    public void setPasswd(String pw) {
        passwd = pw;
    }

    public void setDomain(String dom) {
        domain = dom;
    }

    private HttpClient currentConn = null;

    public void reset() {
        currentConn = null;
    }

    /**
     * Sharepoint requires NTLM. This client requires a non-null user/passwd/domain.
     * 
     */
    @Override
    public HttpClient getClient() {

        if (currentConn != null) {
            return currentConn;
        }

        HttpClientBuilder clientHelper = HttpClientBuilder.create();

        if (proxyHost != null) {
            clientHelper.setProxy(proxyHost);
        }

        RequestConfig globalConfig = RequestConfig.custom().setCookieSpec(CookieSpecs.BROWSER_COMPATIBILITY)
                .build();

        CredentialsProvider creds = new BasicCredentialsProvider();
        creds.setCredentials(AuthScope.ANY, new NTCredentials(user, passwd, server, domain));
        clientHelper.setDefaultCredentialsProvider(creds);
        HttpClient httpClient = clientHelper.setDefaultRequestConfig(globalConfig).build();

        return httpClient;

    }

    /**
     * Recursively parse a site page, limiting the crawl to local items
     * contained within the current folder/page
     * 
     * @param html
     *            HTML text buffer
     * @return
     */
    public Collection<SPLink> parseContentPage(String html, URL pageUrl) {
        Map<String, SPLink> contentLinks = new HashMap<String, SPLink>();
        Pattern href_matcher = Pattern.compile("href=\"([^\"]+)\"");
        Matcher matches = href_matcher.matcher(html);
        while (matches.find()) {
            String link = matches.group(1).trim();
            String link_lc = link.toLowerCase();
            if ("/".equals(link) || "#".equals(link) || link_lc.endsWith(".aspx")
                    || link_lc.startsWith("javascript") || link.startsWith("../") || link.contains("/_layouts/")) {
                continue;
            }
            if (link.endsWith("/")) {
                link = link.substring(0, link.length() - 1);
            }
            try {
                SPLink l = new SPLink(prepURLPath(link).toString(), pageUrl);
                if (l.isResource()) {
                    continue;
                }
                if (!contentLinks.containsKey(l.toString())) {
                    log.info("Found link {}", link);
                    contentLinks.put(l.toString(), l);
                }
            } catch (Exception err) {
                log.error("Failed to parse URL {}", link, err);
            }
        }

        return contentLinks.values();
    }
}