de.l3s.archivepig.enrich.Response.java Source code

Introduction

Here is the source code for de.l3s.archivepig.enrich.Response.java
Source

/*
 * The MIT License (MIT)
 *
 * Copyright (c) 2015 Helge Holzmann
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

package de.l3s.archivepig.enrich;

import com.google.common.collect.Lists;
import de.l3s.archivepig.ArchiveLoader;
import de.l3s.archivepig.DependencyFunc;
import de.l3s.archivepig.EnrichFunc;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.BoundedInputStream;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.util.UDFContext;
import org.archive.format.http.HttpHeader;
import org.archive.format.http.HttpHeaders;
import org.archive.format.http.HttpResponse;
import org.archive.format.http.HttpResponseParser;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveReaderFactory;
import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashMap;
import java.util.Map;

import static de.l3s.archivepig.Shortcuts.*;
import static org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;

public class Response extends EnrichFunc implements DependencyFunc {
    @Override
    public void enrich(Tuple data, Tuple enrichment, Object... params) throws Exception {
        long size = get(data, "_record.size");
        long offset = get(data, "_record.offset");
        String filename = get(data, "_record.filename");
        String cdxFile = get(data, "_record.cdxFile");

        if (size < 0 || offset < 0)
            return;

        FileSystem fs = FileSystem.get(UDFContext.getUDFContext().getJobConf());

        Deque<String> cdxSegments = new ArrayDeque<String>(Lists.reverse(list(cdxFile.split("\\/"))));
        cdxSegments.pop(); // remove filename
        String pathExtension = "";
        Path path = new Path(ArchiveLoader.dataPath(), pathExtension + filename);
        while (!fs.exists(path)) {
            if (cdxSegments.isEmpty()) {
                enrichment.append(new HashMap<String, String>());
                enrichment.append(new HashMap<String, String>());
                enrichment.append(null);
                return;
            }
            String cdxSegment = cdxSegments.pop();
            if (cdxSegment.endsWith(".har"))
                cdxSegment = cdxSegment.substring(0, cdxSegment.length() - 4);
            pathExtension = cdxSegment + "/" + pathExtension;
            path = new Path(ArchiveLoader.dataPath(), pathExtension + filename);
        }
        FSDataInputStream fsin = fs.open(path);
        fsin.seek(offset);
        InputStream in = fsin;

        ByteArrayOutputStream recordOutput = new ByteArrayOutputStream();
        try {
            try (BoundedInputStream boundedIn = new BoundedInputStream(in, size);
                    ArchiveReader reader = ArchiveReaderFactory.get(filename, boundedIn, false);) {
                ArchiveRecord record;
                record = reader.get();

                ArchiveRecordHeader header = record.getHeader();
                enrichment.append(header.getHeaderFields());

                record.dump(recordOutput);
            } catch (Exception e) {
                return;
            } finally {
                in.close();
                recordOutput.close();
            }
        } catch (Exception e) {
            return;
        }

        try (InputStream httpResponse = new ByteArrayInputStream(recordOutput.toByteArray())) {
            // ALL COMMENTS ARE NEW VERSION VARIANTS FOR HTTP-CORE 4.3, currently in use 4.2.5
            //        SessionInputBufferImpl sessionInputBuffer = new SessionInputBufferImpl(new HttpTransportMetricsImpl(), 2048);
            //        sessionInputBuffer.bind(httpResponse);
            //        DefaultHttpResponseParserFactory responseParserFactory = new DefaultHttpResponseParserFactory();
            //        HttpMessageParser<HttpResponse> responseParser = responseParserFactory.create(sessionInputBuffer, MessageConstraints.DEFAULT);
            //        HttpResponse response = responseParser.parse();
            //        Header[] httpHeaders = response.getAllHeaders();

            HttpResponseParser parser = new HttpResponseParser();
            HttpResponse response = parser.parse(httpResponse);
            HttpHeaders httpHeaders = response.getHeaders();

            Map<String, String> httpHeadersMap = new HashMap<String, String>();
            for (HttpHeader httpHeader : httpHeaders) {
                httpHeadersMap.put(httpHeader.getName(), httpHeader.getValue());
            }
            enrichment.append(httpHeadersMap);

            //        byte[] payload = new byte[sessionInputBuffer.length()];
            //        sessionInputBuffer.read(payload);

            byte[] payload = IOUtils.toByteArray(response);

            enrichment.append(payload);

            //        HttpEntity entity = new ByteArrayEntity(payload);
            //        output.append(entity == null ? null : EntityUtils.toString(entity));
        } catch (Exception ignored) {
        }
    }

    @Override
    public String getEnrichNode() {
        return "capture";
    }

    @Override
    public String getExtensionNode() {
        return "response";
    }

    @Override
    public FieldSchema[] getSchemaExtensions() {
        return array(field("header", DataType.MAP), field("httpHeader", DataType.MAP),
                field("payload", DataType.BYTEARRAY));
    }

    @Override
    public String getResultingFieldName(String mapping) {
        switch (mapping) {
        case "content":
            return "payload";
        }
        return null;
    }
}