com.datafibers.kafka.connect.FileGenericSourceTask.java Source code


Introduction

Here is the source code for com.datafibers.kafka.connect.FileGenericSourceTask.java, a Kafka Connect SourceTask that scans a directory for files matching a glob, reads them line by line, optionally decodes each line against an Avro schema fetched from a Schema Registry, and publishes the results as SourceRecords to a Kafka topic.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
 * agreements. See the NOTICE file distributed with this work for additional information regarding
 * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License. You may obtain a
 * copy of the License at
 * <p>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p>
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 **/

package com.datafibers.kafka.connect;

import static org.apache.avro.Schema.Type.NULL;
import static org.apache.avro.Schema.Type.RECORD;

import java.io.*;
import java.net.URL;
import java.nio.file.FileSystems;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.PathMatcher;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.*;
import java.util.Map.Entry;

import org.apache.avro.Schema.Field;
import org.apache.avro.Schema.Parser;
import org.apache.avro.Schema.Type;
import org.apache.avro.SchemaParseException;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.kafka.common.utils.AppInfoParser;
import org.apache.kafka.connect.data.Schema;
import org.apache.kafka.connect.data.SchemaBuilder;
import org.apache.kafka.connect.data.Struct;
import org.apache.kafka.connect.errors.ConnectException;
import org.apache.kafka.connect.source.SourceRecord;
import org.apache.kafka.connect.source.SourceTask;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.map.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class FileGenericSourceTask extends SourceTask {
    private static final Logger log = LoggerFactory.getLogger(FileGenericSourceTask.class);
    public static final String FILENAME_FIELD = "filename";
    public static final String POSITION_FIELD = "position";
    public static final String FILENAME_EXT_PROCESSING = ".processing";
    public static final String FILENAME_EXT_PROCESSED = ".processed";

    private String topic;
    private String location;
    private String glob;
    private int interval;
    private boolean overwrite;
    private boolean schemaValidate;
    private Schema dataSchema;

    private List<Path> processedPaths = new ArrayList<Path>();
    private List<Path> inProgressPaths = new ArrayList<Path>();

    private String filename;
    private File fileInProcessing; //.processing
    private File fileProcessed; //.processed
    private InputStream stream;
    private BufferedReader reader = null;
    private char[] buffer = new char[1024];
    private int offset = 0;
    private Long streamOffset;

    @Override
    public String version() {
        return AppInfoParser.getVersion();
    }

    @Override
    public void start(Map<String, String> props) {
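        // Read the connector configuration, scan the configured location for files matching
        // the glob, and, unless schema validation is disabled, fetch the Avro schema from the
        // Schema Registry and translate it into a Kafka Connect Schema.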
        topic = props.get(FileGenericSourceConnector.TOPIC_CONFIG);
        location = props.get(FileGenericSourceConnector.FILE_LOCATION_CONFIG);
        glob = props.get(FileGenericSourceConnector.FILE_LOCATION_CONFIG)
                .concat(props.get(FileGenericSourceConnector.FILE_GLOB_CONFIG));
        interval = Integer.parseInt(props.get(FileGenericSourceConnector.FILE_INTERVAL_CONFIG)) * 1000;
        overwrite = Boolean.valueOf(props.get(FileGenericSourceConnector.FILE_OVERWRITE_CONFIG));

        findMatch();
        String schemaUri = props.get(FileGenericSourceConnector.SCHEMA_URI_CONFIG);
        String schemaIgnored = props.get(FileGenericSourceConnector.SCHEMA_IGNORED);

        if ("true".equalsIgnoreCase(schemaIgnored)) {
            schemaValidate = false;

        } else {
            // Get the Avro schema from the registry and build a Connect Schema from it
            schemaValidate = true;
            String schemaSubject = props.get(FileGenericSourceConnector.SCHEMA_SUBJECT_CONFIG);
            String schemaVersion = props.get(FileGenericSourceConnector.SCHEMA_VERSION_CONFIG);
            String fullUrl = String.format("%s/subjects/%s/versions/%s", schemaUri, schemaSubject, schemaVersion);

            String schemaString = null;
            BufferedReader br = null;
            try {
                StringBuilder response = new StringBuilder();
                String line;
                br = new BufferedReader(new InputStreamReader(new URL(fullUrl).openStream()));
                while ((line = br.readLine()) != null) {
                    response.append(line);
                }

                JsonNode responseJson = new ObjectMapper().readValue(response.toString(), JsonNode.class);
                schemaString = responseJson.get("schema").asText();
                log.info("Schem String is " + schemaString);
            } catch (Exception ex) {
                throw new ConnectException("Unable to retrieve schema from Schema Registry", ex);
            } finally {
                try {
                    if (br != null)
                        br.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }

            org.apache.avro.Schema avroSchema = null;
            try {
                avroSchema = new Parser().parse(schemaString);
            } catch (SchemaParseException ex) {
                throw new ConnectException(
                        String.format("Unable to succesfully parse schema from: %s", schemaString), ex);
            }

            SchemaBuilder builder = null;
            // TODO: Add other avro schema types
            switch (avroSchema.getType()) {
            case RECORD: {
                builder = SchemaBuilder.struct();
                break;
            }
            case STRING: {
                builder = SchemaBuilder.string();
                break;
            }
            case INT: {
                builder = SchemaBuilder.int32();
                break;
            }
            case BOOLEAN: {
                builder = SchemaBuilder.bool();
                break;
            }
            default:
                builder = SchemaBuilder.string();
            }

            if (avroSchema.getFullName() != null)
                builder.name(avroSchema.getFullName());
            if (avroSchema.getDoc() != null)
                builder.doc(avroSchema.getDoc());

            if (RECORD.equals(avroSchema.getType()) && avroSchema.getFields() != null
                    && !avroSchema.getFields().isEmpty()) {
                for (org.apache.avro.Schema.Field field : avroSchema.getFields()) {
                    boolean hasDefault = field.defaultValue() != null;

                    SchemaBuilder innerBuilder = getInnerBuilder(field);

                    if (hasDefault)
                        innerBuilder.optional();

                    builder.field(field.name(), innerBuilder.build());
                }
            }

            dataSchema = builder.build();
        }

    }

    @Override
    public List<SourceRecord> poll() throws InterruptedException {
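        // Take the next matching file, rename it with the .processing suffix, read it line by
        // line into SourceRecords, then rename it to .processed. If no file is pending, wait
        // for the configured interval, rescan for matches, and return null.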
        if (!inProgressPaths.isEmpty()) {
            try {
                Path currentPath = inProgressPaths.remove(0);
                processedPaths.add(currentPath);
                filename = currentPath.getFileName().toString();
                fileInProcessing = FileUtils.getFile(currentPath.toString() + FILENAME_EXT_PROCESSING);
                fileProcessed = FileUtils.getFile(currentPath.toString() + FILENAME_EXT_PROCESSED);
                FileUtils.moveFile(FileUtils.getFile(currentPath.toString()), fileInProcessing);

                stream = new FileInputStream(fileInProcessing);

                Map<String, Object> offset = context.offsetStorageReader()
                        .offset(Collections.singletonMap(FILENAME_FIELD, filename));
                if (offset != null && !overwrite) {
                    log.info("Found previous offset, will not process {}", filename);
                    return null;
                } else
                    streamOffset = 0L;

                reader = new BufferedReader(new InputStreamReader(stream));
                log.info("Opened {} for reading", filename);
            } catch (IOException e) {
                throw new ConnectException(String.format("Unable to open file %", filename), e);
            }
        } else {
            log.warn("********* Waiting for file that meets the glob criteria! *********");
            synchronized (this) {
                this.wait(interval);
                findMatch();
            }
            return null;
        }

        ArrayList<SourceRecord> records = new ArrayList<SourceRecord>();

        try {
            final BufferedReader readerCopy;
            synchronized (this) {
                readerCopy = reader;
            }
            if (readerCopy == null)
                return null;

            int nread = 0;
            while (readerCopy.ready()) {
                nread = readerCopy.read(buffer, offset, buffer.length - offset);
                log.trace("Read {} bytes from {}", nread, filename);

                if (nread > 0) {
                    offset += nread;
                    if (offset == buffer.length) {
                        char[] newbuf = new char[buffer.length * 2];
                        System.arraycopy(buffer, 0, newbuf, 0, buffer.length);
                        buffer = newbuf;
                    }

                    String line;
                    do {
                        line = extractLine();
                        if (line != null) {
                            line = line.trim();
                            log.trace("Read a line from {}", filename);
                            if (schemaValidate) {
                                records.add(new SourceRecord(offsetKey(filename), offsetValue(streamOffset), topic,
                                        dataSchema, structDecodingRoute(line, filename)));
                            } else {
                                log.info("STRING SCHEMA Processing");
                                records.add(new SourceRecord(offsetKey(filename), offsetValue(streamOffset), topic,
                                        Schema.STRING_SCHEMA, line));
                            }

                        }
                    } while (line != null);
                }
            }

            // Finish processing and rename as processed.
            FileUtils.moveFile(fileInProcessing, fileProcessed);

            if (nread <= 0)
                synchronized (this) {
                    this.wait(1000);
                }

            return records;

        } catch (IOException e) {
            throw new ConnectException(String.format("Unable to read file %", filename), e);
        }

    }

    @Override
    public void stop() {
        log.trace("Stopping");
        synchronized (this) {
            try {
                if (stream != null && stream != System.in) {
                    stream.close();
                    log.trace("Closed input stream");
                }
            } catch (IOException e) {
                log.error("Failed to close FileGenericSourceTask stream: ", e);
            }
            this.notify();
        }
    }

    /**
     * Looks for files that meet the glob criteria. If any are found, they are added to the
     * list of files to be processed.
     */
    private void findMatch() {
        final PathMatcher globMatcher = FileSystems.getDefault().getPathMatcher("glob:".concat(glob));

        try {
            Files.walkFileTree(Paths.get(location), new SimpleFileVisitor<Path>() {
                @Override
                public FileVisitResult visitFile(Path path, BasicFileAttributes attributes) throws IOException {
                    if (globMatcher.matches(path)) {
                        if (!processedPaths.contains(path)) {
                            inProgressPaths.add(path);
                        }
                    }
                    return FileVisitResult.CONTINUE;
                }

                @Override
                public FileVisitResult visitFileFailed(Path file, IOException e) throws IOException {
                    return FileVisitResult.CONTINUE;
                }
            });
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    private SchemaBuilder getInnerBuilder(Field field) {
        return getInnerBuilder(field, field.schema().getType());
    }

    private SchemaBuilder getInnerBuilder(Field field, Type type) {
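        // Map an Avro field type to the corresponding Connect SchemaBuilder, unwrapping
        // nullable unions and carrying over any default value declared for the field.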
        boolean hasDefault = field.defaultValue() != null;
        SchemaBuilder innerBuilder = null;
        switch (type) {
        case STRING: {
            innerBuilder = SchemaBuilder.string();
            if (hasDefault)
                innerBuilder.defaultValue(field.defaultValue().asText());
            break;
        }
        case INT: {
            innerBuilder = SchemaBuilder.int32();
            if (hasDefault)
                innerBuilder.defaultValue(field.defaultValue().asInt());
            break;
        }
        case BOOLEAN: {
            innerBuilder = SchemaBuilder.bool();
            if (hasDefault)
                innerBuilder.defaultValue(field.defaultValue().asBoolean());
            break;
        }
        case UNION: {
            for (org.apache.avro.Schema schema : field.schema().getTypes()) {
                if (!schema.getType().equals(NULL))
                    return getInnerBuilder(field, schema.getType());
            }
        }
        default:
            throw new ConnectException(
                    "Unable to build schema because there is no case for type " + field.schema().getType());
        }
        return innerBuilder;
    }

    private String extractLine() {
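        // Scan the buffer for a line terminator (\n, \r, or \r\n), return the completed line,
        // shift any remaining characters to the front of the buffer, and advance the stream offset.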
        int until = -1, newStart = -1;
        for (int i = 0; i < offset; i++) {
            if (buffer[i] == '\n') {
                until = i;
                newStart = i + 1;
                break;
            } else if (buffer[i] == '\r') {
                // We need to check for \r\n, so we must skip this if we can't check the next char
                if (i + 1 >= offset)
                    return null;

                until = i;
                newStart = (buffer[i + 1] == '\n') ? i + 2 : i + 1;
                break;
            }
        }

        if (until != -1) {
            String result = new String(buffer, 0, until);
            System.arraycopy(buffer, newStart, buffer, 0, buffer.length - newStart);
            offset = offset - newStart;
            if (streamOffset != null)
                streamOffset += newStart;
            return result;
        } else {
            return null;
        }
    }

    private Map<String, String> offsetKey(String filename) {
        return Collections.singletonMap(FILENAME_FIELD, filename);
    }

    private Map<String, Long> offsetValue(Long pos) {
        return Collections.singletonMap(POSITION_FIELD, pos);
    }

    /**
     * Decode a line into a Struct according to the schema from the Confluent Schema Registry,
     * routing by file extension (JSON, CSV, or TSV).
     * @param line the raw line read from the file
     * @param file_name the name of the file the line was read from
     * @return the decoded Struct, or null if the extension is not supported
     */
    public Struct structDecodingRoute(String line, String file_name) {
        Struct st = null;
        switch (FilenameUtils.getExtension(file_name).toLowerCase()) {
        case "json":
            log.info("Read line @@" + line + "@@ from Json File " + file_name);
            st = structDecodingFromJson(line);
            break;
        case "csv":
        case "tsv":
            log.info("Read line @@" + line + "@@ from Csv File " + file_name);
            st = structDecodingFromCsv(line);
            break;
        default:
            log.info("Default file extension not processing");
        }
        return st;
    }

    /**
     * Decode a JSON line into a Struct according to the schema from the Confluent Schema Registry.
     * @param line the raw JSON line
     * @return the decoded Struct, or null if the line is empty
     */
    public Struct structDecodingFromJson(String line) {

        if (line.length() > 0) {
            JsonNode json = null;
            try {
                json = new ObjectMapper().readValue(line, JsonNode.class);
            } catch (IOException ex) {
                throw new ConnectException(String.format("Unable to parse %s into a valid JSON", filename), ex);
            }

            Struct struct = new Struct(dataSchema);
            Iterator<Entry<String, JsonNode>> iterator = json.getFields();
            while (iterator.hasNext()) {
                Entry<String, JsonNode> entry = iterator.next();
                Object value = null;
                org.apache.kafka.connect.data.Field theField = dataSchema.field(entry.getKey());
                if (theField != null) {
                    switch (theField.schema().type()) {
                    case STRING: {
                        value = entry.getValue().asText();
                        break;
                    }
                    case INT32: {
                        value = entry.getValue().asInt();
                        break;
                    }
                    case BOOLEAN: {
                        value = entry.getValue().asBoolean();
                        break;
                    }
                    default:
                        value = entry.getValue().asText();
                    }
                }
                struct.put(entry.getKey(), value);

            }
            return struct;
        }
        return null;
    }

    /**
     * Decode a CSV line into a Struct according to the schema from the Confluent Schema Registry.
     * @param line the raw CSV line
     * @return the decoded Struct, or null if the line is empty
     */
    public Struct structDecodingFromCsv(String line) {
        if (line.length() > 0) {
            Struct struct = new Struct(dataSchema);
            JsonNode json = null;
            try {
                // TODO: support other types of delimited files here
                CSVParser csvParser = CSVFormat.EXCEL.withIgnoreEmptyLines().withIgnoreHeaderCase()
                        .withRecordSeparator('\n').withQuote('"').withEscape('\\').withDelimiter(',').withTrim()
                        .parse(new StringReader(line));

                // Since this parses a single line, only the first record (index 0) is used
                CSVRecord entry = csvParser.getRecords().get(0);
                List<org.apache.kafka.connect.data.Field> fields = dataSchema.fields();
                int schema_fields_size = fields.size();
                log.info("schema_fields_size = " + schema_fields_size);

                for (int index = 0; index < schema_fields_size; index++) {
                    Object value = null;
                    org.apache.kafka.connect.data.Field theField = fields.get(index);
                    log.info("printed indexed " + index + " fields: " + theField.name() + ":"
                            + theField.schema().type());
                    if (theField != null) {
                        switch (theField.schema().type()) {
                        case STRING: {
                            value = entry.get(index);
                            break;
                        }
                        case INT32: {
                            value = Integer.parseInt(entry.get(index));
                            break;
                        }
                        case BOOLEAN: {
                            value = Boolean.parseBoolean(entry.get(index));
                            break;
                        }
                        default:
                            value = entry.get(index);
                        }
                    }
                    struct.put(theField.name(), value);
                }
            } catch (IOException ex) {
                throw new ConnectException(String.format("Unable to parse %s into a valid CSV", filename), ex);
            }
            return struct;
        }
        return null;
    }
}
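
Example

The schema handling above is easier to follow in isolation. The following is a minimal standalone sketch (not part of the connector) that uses the same Avro Parser and Connect SchemaBuilder APIs to translate a small Avro record schema into a Connect Schema and populate a Struct, mirroring what start() and structDecodingFromJson() do. The schema, class name, and field values are made up for illustration.

import org.apache.avro.Schema.Parser;
import org.apache.kafka.connect.data.Schema;
import org.apache.kafka.connect.data.SchemaBuilder;
import org.apache.kafka.connect.data.Struct;

public class SchemaTranslationSketch {
    public static void main(String[] args) {
        // A hypothetical Avro record schema, similar to what the task fetches from the registry.
        String avroJson = "{\"type\":\"record\",\"name\":\"User\",\"namespace\":\"demo\",\"fields\":["
                + "{\"name\":\"name\",\"type\":\"string\"},"
                + "{\"name\":\"age\",\"type\":\"int\"}]}";
        org.apache.avro.Schema avroSchema = new Parser().parse(avroJson);

        // Translate the Avro record into a Connect schema, mirroring the switch in start().
        SchemaBuilder builder = SchemaBuilder.struct().name(avroSchema.getFullName());
        for (org.apache.avro.Schema.Field field : avroSchema.getFields()) {
            switch (field.schema().getType()) {
            case STRING:
                builder.field(field.name(), Schema.STRING_SCHEMA);
                break;
            case INT:
                builder.field(field.name(), Schema.INT32_SCHEMA);
                break;
            case BOOLEAN:
                builder.field(field.name(), Schema.BOOLEAN_SCHEMA);
                break;
            default:
                builder.field(field.name(), Schema.STRING_SCHEMA);
            }
        }
        Schema connectSchema = builder.build();

        // Populate a Struct the way structDecodingFromJson()/structDecodingFromCsv() do.
        Struct struct = new Struct(connectSchema)
                .put("name", "alice")
                .put("age", 30);
        System.out.println(struct);
    }
}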