org.talend.repository.hdfs.ui.metadata.ExtractTextFileSchemaService.java Source code

Introduction

Here is the source code for org.talend.repository.hdfs.ui.metadata.ExtractTextFileSchemaService.java, a Talend repository service that guesses the schema (column list) of a delimited text file stored on HDFS.
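
The service implements IExtractSchemaService<HDFSConnection>: it streams a text file from HDFS, copies a bounded number of lines into a local temporary file, runs Talend's delimited-file shadow process on that file, and derives a list of MetadataColumn objects from the preview rows. Below is a minimal usage sketch. It assumes a running Talend workspace in which an HDFSConnection and an IHDFSNode pointing at a text file are already available (for example from the HDFS metadata wizard); the class name ExtractTextFileSchemaUsageSketch, the method printGuessedSchema, and the choice of the context class loader are illustrative and not part of the Talend code base.

import java.util.List;

import org.talend.core.model.metadata.builder.connection.MetadataColumn;
import org.talend.designer.hdfsbrowse.model.IHDFSNode;
import org.talend.repository.hdfs.ui.metadata.ExtractTextFileSchemaService;
import org.talend.repository.model.hdfs.HDFSConnection;

public class ExtractTextFileSchemaUsageSketch {

    /**
     * Prints the schema guessed for an HDFS text file. The connection and file node are assumed to be supplied by
     * the surrounding Talend context; they are not created here.
     */
    public static void printGuessedSchema(HDFSConnection connection, IHDFSNode fileNode) throws Exception {
        // The class loader must be able to resolve the Hadoop client classes used to read the file from HDFS.
        ClassLoader hadoopClassLoader = Thread.currentThread().getContextClassLoader();
        ExtractTextFileSchemaService service = new ExtractTextFileSchemaService(hadoopClassLoader);

        List<MetadataColumn> columns = service.extractColumns(connection, fileNode);
        for (MetadataColumn column : columns) {
            System.out.println(column.getLabel() + " : " + column.getTalendType() + " (length " + column.getLength() + ")");
        }
    }
}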

Source

// ============================================================================
//
// Copyright (C) 2006-2016 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.repository.hdfs.ui.metadata;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.eclipse.core.resources.IProject;
import org.eclipse.core.runtime.CoreException;
import org.eclipse.emf.common.util.EMap;
import org.eclipse.jface.preference.IPreferenceStore;
import org.talend.commons.exception.ExceptionHandler;
import org.talend.commons.exception.PersistenceException;
import org.talend.commons.utils.data.text.IndiceHelper;
import org.talend.commons.utils.encoding.CharsetToolkit;
import org.talend.commons.utils.workbench.resources.ResourceUtils;
import org.talend.core.GlobalServiceRegister;
import org.talend.core.model.general.Project;
import org.talend.core.model.metadata.MetadataToolHelper;
import org.talend.core.model.metadata.builder.connection.ConnectionFactory;
import org.talend.core.model.metadata.builder.connection.MetadataColumn;
import org.talend.core.model.metadata.builder.connection.MetadataTable;
import org.talend.core.model.metadata.types.JavaDataTypeHelper;
import org.talend.core.model.metadata.types.JavaTypesManager;
import org.talend.core.ui.preference.metadata.MetadataTypeLengthConstants;
import org.talend.core.ui.services.IDesignerCoreUIService;
import org.talend.core.utils.CsvArray;
import org.talend.core.utils.TalendQuoteUtils;
import org.talend.designer.hdfsbrowse.exceptions.HadoopServerException;
import org.talend.designer.hdfsbrowse.manager.HadoopOperationManager;
import org.talend.designer.hdfsbrowse.model.EHDFSFileTypes;
import org.talend.designer.hdfsbrowse.model.EHadoopFileTypes;
import org.talend.designer.hdfsbrowse.model.HDFSFile;
import org.talend.designer.hdfsbrowse.model.IHDFSNode;
import org.talend.metadata.managment.ui.preview.ProcessDescription;
import org.talend.metadata.managment.ui.preview.ShadowProcessPreview;
import org.talend.metadata.managment.ui.utils.ConnectionContextHelper;
import org.talend.metadata.managment.ui.utils.ShadowProcessHelper;
import org.talend.repository.ProjectManager;
import org.talend.repository.hadoopcluster.service.IExtractSchemaService;
import org.talend.repository.hdfs.util.HDFSConstants;
import org.talend.repository.hdfs.util.HDFSModelUtil;
import org.talend.repository.model.hdfs.HDFSConnection;

/**
 * created by ycbai on 2014-5-29 Detailed comment
 * 
 */
public class ExtractTextFileSchemaService implements IExtractSchemaService<HDFSConnection> {

    private final static String DEFAULT_SHADOW_TYPE = "FILE_DELIMITED"; //$NON-NLS-1$

    private final static String DEFAULT_FILE_SERVER = "Localhost 127.0.0.1"; //$NON-NLS-1$

    private ClassLoader classLoader;

    public ExtractTextFileSchemaService(ClassLoader classLoader) {
        this.classLoader = classLoader;
    }

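    /**
     * Extracts the columns of an HDFS text file node: the file content is streamed from HDFS and delegated to the
     * stream-based overload under the name of the node's table.
     */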
    @Override
    public List<MetadataColumn> extractColumns(HDFSConnection connection, IHDFSNode node) throws Exception {
        List<MetadataColumn> columns = new ArrayList<MetadataColumn>();
        if (connection == null || node == null || node.getType() != EHadoopFileTypes.FILE) {
            return columns;
        }

        HDFSFile file = (HDFSFile) node;
        file.setFileType(EHDFSFileTypes.TEXT);
        InputStream inputStream = HadoopOperationManager.getInstance()
                .getFileContent(HDFSModelUtil.convert2HDFSConnectionBean(connection), classLoader, file.getPath());

        return extractColumns(connection, inputStream, file.getTable().getName());
    }

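    /**
     * Extracts the columns of an existing metadata table whose HDFS path is stored in the HDFS_PATH additional
     * property.
     */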
    @Override
    public List<MetadataColumn> extractColumns(HDFSConnection connection, MetadataTable metadataTable)
            throws HadoopServerException, CoreException, IOException {
        List<MetadataColumn> columns = new ArrayList<MetadataColumn>();
        if (connection == null || metadataTable == null) {
            return columns;
        }
        EMap<String, String> additionalProperties = metadataTable.getAdditionalProperties();
        String hdfsPath = additionalProperties.get(HDFSConstants.HDFS_PATH);
        if (StringUtils.isEmpty(hdfsPath)) {
            return columns;
        }
        InputStream inputStream = HadoopOperationManager.getInstance()
                .getFileContent(HDFSModelUtil.convert2HDFSConnectionBean(connection), classLoader, hdfsPath);

        return extractColumns(connection, inputStream, metadataTable.getLabel());
    }

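    /**
     * Copies the stream into a local temporary file, runs the delimited-file shadow process on it and guesses the
     * schema from the resulting preview rows.
     */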
    private List<MetadataColumn> extractColumns(HDFSConnection connection, InputStream inputStream,
            String tmpFileName) throws CoreException, IOException {
        List<MetadataColumn> columns = new ArrayList<MetadataColumn>();
        if (connection == null || inputStream == null || tmpFileName == null) {
            return columns;
        }
        File tmpFile = createTmpFile(inputStream, tmpFileName);
        CsvArray csvArray = ShadowProcessHelper.getCsvArray(getProcessDescription(connection, tmpFile),
                DEFAULT_SHADOW_TYPE, true);
        return guessSchemaFromArray(csvArray, connection.isFirstLineCaption(), connection.getHeaderValue());
    }

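    /**
     * Builds the shadow process description for the temporary file: the encoding is guessed from the file itself,
     * while the field/row separators and the header setting come from the HDFS connection.
     */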
    private ProcessDescription getProcessDescription(HDFSConnection connection, File tmpFile) throws IOException {
        ProcessDescription processDescription = new ProcessDescription();
        Charset guessedCharset = CharsetToolkit.guessEncoding(tmpFile, 4096);
        processDescription.setEncoding(TalendQuoteUtils.addQuotesIfNotExist(guessedCharset.displayName()));
        processDescription.setFieldSeparator(TalendQuoteUtils.addQuotesIfNotExist(connection.getFieldSeparator()));
        processDescription.setRowSeparator(TalendQuoteUtils.addQuotesIfNotExist(connection.getRowSeparator()));
        processDescription
                .setFilepath(TalendQuoteUtils.addQuotesIfNotExist(formatFilePath(tmpFile.getAbsolutePath())));
        processDescription.setFooterRow(0);
        int i = -1;
        if (connection.isUseHeader()) {
            i = ConnectionContextHelper.convertValue(connection.getHeaderValue());
            if (i != -1) {
                i--;
            }
        }
        processDescription.setHeaderRow(i);
        processDescription.setCSVOption(false);
        processDescription.setLimitRows(DEFAULT_READ_LINE_NUM);
        processDescription.setPattern(TalendQuoteUtils.addQuotesIfNotExist(connection.getFieldSeparator()));
        processDescription.setRemoveEmptyRow(false);
        processDescription.setServer(TalendQuoteUtils.addQuotesIfNotExist(DEFAULT_FILE_SERVER));
        processDescription.setSplitRecord(false);

        return processDescription;
    }

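    /**
     * Convenience overload which limits the copy to DEFAULT_READ_LINE_NUM lines.
     */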
    private File createTmpFile(InputStream inputStream, String fileName) {
        return createTmpFile(inputStream, fileName, DEFAULT_READ_LINE_NUM);
    }

    /**
     * DOC ycbai Comment method "createTmpFile".
     * 
     * Creates a temporary file whose contents are read from the given input stream.
     * 
     * @param inputStream the input stream to read from
     * @param fileName the name of the temporary file
     * @param maxLineNum the maximum number of lines to read; "-1" means no limit
     * @return the temporary file, or null if the workspace project cannot be resolved
     */
    private File createTmpFile(InputStream inputStream, String fileName, int maxLineNum) {
        Project project = ProjectManager.getInstance().getCurrentProject();
        IProject fsProject = null;
        try {
            fsProject = ResourceUtils.getProject(project);
        } catch (PersistenceException e2) {
            ExceptionHandler.process(e2);
        }
        if (fsProject == null) {
            return null;
        }
        File tmpParentFile = new File(
                fsProject.getLocationURI().getPath() + File.separator + "temp" + File.separator + "hdfs"); //$NON-NLS-1$ //$NON-NLS-2$
        File tmpfile = new File(tmpParentFile, fileName);
        BufferedReader reader = null;
        Writer writer = null;
        try {
            if (!tmpParentFile.exists()) {
                tmpParentFile.mkdirs();
            }
            if (tmpfile.exists()) {
                tmpfile.delete();
            }
            reader = new BufferedReader(new InputStreamReader(inputStream));
            writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(tmpfile)));
            int totalLines = 0;
            String strLine = null;
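            // Copy lines from the HDFS stream into the temporary file with CRLF line endings, stopping after
            // maxLineNum lines unless maxLineNum is -1 (no limit).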
            do {
                totalLines++;
                strLine = reader.readLine();
                if (strLine != null) {
                    writer.append(strLine);
                    writer.append("\r\n"); //$NON-NLS-1$
                }
            } while (strLine != null && (maxLineNum == -1 || totalLines < maxLineNum));
        } catch (Exception e) {
            ExceptionHandler.process(e);
        } finally {
            try {
                inputStream.close();
                if (reader != null) {
                    reader.close();
                }
                if (writer != null) {
                    writer.close();
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        return tmpfile;
    }

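    /**
     * Normalizes Windows path separators so the shadow process always receives a forward-slash path.
     */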
    private String formatFilePath(String path) {
        if (path == null) {
            return ""; //$NON-NLS-1$
        }
        return path.replace("\\", "/"); //$NON-NLS-1$ //$NON-NLS-2$
    }

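    /**
     * Guesses a column list from the preview rows: when the first line is a caption it supplies the column labels,
     * and the Talend type, length and precision of each column are inferred from the remaining rows.
     */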
    public List<MetadataColumn> guessSchemaFromArray(final CsvArray csvArray, boolean isFirstLineCaption,
            String header) {
        List<MetadataColumn> columns = new ArrayList<MetadataColumn>();
        List<String> exisColumnNames = new ArrayList<String>();

        int headerValue = 0;
        if (StringUtils.isNotBlank(header)) {
            headerValue = Integer.parseInt(header);
        }

        if (csvArray == null) {
            return columns;
        } else {
            List<String[]> csvRows = csvArray.getRows();
            if (csvRows.isEmpty()) {
                return columns;
            }

            String[] fields = csvRows.get(0);
            int numberOfCol = getNumbersOfColumns(csvRows);

            // define the labels of the metadata with the content of the first row
            int firstRowToExtractMetadata = headerValue;

            // the first row is used to define the label of each metadata column
            String[] label = new String[numberOfCol];
            for (int i = 0; i < numberOfCol; i++) {
                label[i] = DEFAULT_COLUMN_LABEL + i;
                if (isFirstLineCaption) {
                    // if the current field count is greater than or equal to the biggest column count
                    if (numberOfCol <= fields.length) {
                        if (fields[i] != null && !("").equals(fields[i])) { //$NON-NLS-1$
                            label[i] = fields[i].trim().replaceAll(" ", "_"); //$NON-NLS-1$ //$NON-NLS-2$
                            label[i] = MetadataToolHelper.validateColumnName(label[i], i);
                        } else {
                            label[i] = DEFAULT_COLUMN_LABEL + i;
                        }
                    } else {
                        // the current field count is less than the biggest column count
                        if (i < fields.length) {
                            if (fields[i] != null && !("").equals(fields[i])) { //$NON-NLS-1$
                                label[i] = fields[i].trim().replaceAll(" ", "_"); //$NON-NLS-1$ //$NON-NLS-2$
                            } else {
                                label[i] = DEFAULT_COLUMN_LABEL + " " + i; //$NON-NLS-1$ 
                            }
                        } else {
                            label[i] = DEFAULT_COLUMN_LABEL + " " + i; //$NON-NLS-1$ 
                        }
                    }
                }
            }
            // fix bug 5694: column names check in FileDelimited wizard fails to
            // rename duplicate column name
            ShadowProcessPreview.fixDuplicateNames(label);

            for (int i = 0; i < numberOfCol; i++) {
                // determine the first currentType and use it as the initial globalType
                String globalType = null;
                int lengthValue = 0;
                int precisionValue = 0;

                int current = firstRowToExtractMetadata;
                while (globalType == null) {
                    // see the feature 6296,qli comment
                    if (current == csvRows.size()) {
                        globalType = "id_String";//$NON-NLS-1$
                        continue;
                    } else if (i >= csvRows.get(current).length) {
                        globalType = "id_String"; //$NON-NLS-1$
                    } else {
                        globalType = JavaDataTypeHelper.getTalendTypeOfValue(csvRows.get(current)[i]);
                        current++;
                    }
                }

                // for the remaining lines
                for (int f = firstRowToExtractMetadata; f < csvRows.size(); f++) {
                    fields = csvRows.get(f);
                    if (fields.length > i) {
                        String value = fields[i];
                        if (!value.equals("")) { //$NON-NLS-1$
                            if (!JavaDataTypeHelper.getTalendTypeOfValue(value).equals(globalType)) {
                                globalType = JavaDataTypeHelper.getCommonType(globalType,
                                        JavaDataTypeHelper.getTalendTypeOfValue(value));
                            }
                            if (lengthValue < value.length()) {
                                lengthValue = value.length();
                            }
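                            // if the value contains a decimal separator, estimate the precision from the position
                            // of the last ',' or '.'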
                            int positionDecimal = 0;
                            if (value.indexOf(',') > -1) {
                                positionDecimal = value.lastIndexOf(',');
                                precisionValue = lengthValue - positionDecimal;
                            } else if (value.indexOf('.') > -1) {
                                positionDecimal = value.lastIndexOf('.');
                                precisionValue = lengthValue - positionDecimal;
                            }
                        } else {
                            IPreferenceStore preferenceStore = null;
                            if (GlobalServiceRegister.getDefault()
                                    .isServiceRegistered(IDesignerCoreUIService.class)) {
                                IDesignerCoreUIService designerCoreUiService = (IDesignerCoreUIService) GlobalServiceRegister
                                        .getDefault().getService(IDesignerCoreUIService.class);
                                preferenceStore = designerCoreUiService.getPreferenceStore();
                            }
                            if (preferenceStore != null
                                    && preferenceStore
                                            .getString(MetadataTypeLengthConstants.VALUE_DEFAULT_TYPE) != null
                                    && !preferenceStore.getString(MetadataTypeLengthConstants.VALUE_DEFAULT_TYPE)
                                            .equals("")) { //$NON-NLS-1$
                                globalType = preferenceStore
                                        .getString(MetadataTypeLengthConstants.VALUE_DEFAULT_TYPE);
                                if (preferenceStore
                                        .getString(MetadataTypeLengthConstants.VALUE_DEFAULT_LENGTH) != null
                                        && !preferenceStore
                                                .getString(MetadataTypeLengthConstants.VALUE_DEFAULT_LENGTH)
                                                .equals("")) { //$NON-NLS-1$
                                    lengthValue = Integer.parseInt(preferenceStore
                                            .getString(MetadataTypeLengthConstants.VALUE_DEFAULT_LENGTH));
                                }
                            }
                        }
                    }
                }
                // see the feature 6296,qli comment
                if (csvRows.size() <= 1 && firstRowToExtractMetadata == 1) {
                    lengthValue = 255;
                }

                // define the metadataColumn to field i
                MetadataColumn metadataColumn = ConnectionFactory.eINSTANCE.createMetadataColumn();
                metadataColumn.setPattern("\"dd-MM-yyyy\""); //$NON-NLS-1$
                // Convert javaType to TalendType
                String talendType = globalType;
                if (globalType.equals(JavaTypesManager.FLOAT.getId())
                        || globalType.equals(JavaTypesManager.DOUBLE.getId())) {
                    metadataColumn.setPrecision(precisionValue);
                } else {
                    metadataColumn.setPrecision(0);
                }
                metadataColumn.setTalendType(talendType);
                metadataColumn.setLength(lengthValue);
                String columnLabel = IndiceHelper.getIndexedLabel(label[i], exisColumnNames);
                metadataColumn.setLabel(columnLabel);
                if (!exisColumnNames.contains(columnLabel)) {
                    exisColumnNames.add(columnLabel);
                }
                columns.add(i, metadataColumn);
            }
        }
        return columns;
    }

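    /**
     * Returns the widest row length found among the first 50 preview rows.
     */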
    private int getNumbersOfColumns(List<String[]> csvRows) {
        int numbersOfColumns = 0;
        int parserLine = csvRows.size();
        if (parserLine > 50) {
            parserLine = 50;
        }
        for (int i = 0; i < parserLine; i++) {
            if (csvRows.get(i) != null) {
                String[] nbRow = csvRows.get(i);
                if (nbRow.length >= numbersOfColumns) {
                    numbersOfColumns = nbRow.length;
                }
            }
        }
        return numbersOfColumns;
    }

}