csv.parser.CSVParser.java Source code

Java tutorial

Introduction

Here is the source code for csv.parser.CSVParser.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package csv.parser;

/**
 *
 * @author best1yash
 */
import java.io.FileReader;
import java.util.Arrays;
import java.util.List;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Stack;
import java.util.*;
import com.opencsv.*;
import java.io.FileNotFoundException;
import org.apache.commons.lang3.ArrayUtils;

public class CSVParser {

    @SuppressWarnings("resource")
    public static void main(String[] args) throws Exception {
        String file_to_parse;
        String[] val_array;
        file_to_parse = "./input/E-library-data-3.csv";
        //Build reader instance
        //Read CSV file
        CSVReader reader = new CSVReader(new FileReader(file_to_parse), ';', '"', 1);

        //Read all rows at once
        List<String[]> allRows = reader.readAll();

        //        Read CSV line by line and use the string array as you want
        for (String[] row : allRows) {
            for (int i = 0; i < row.length; i++) {
                //Removing all newlines, tabs and '&' characters(invalid XML character)
                row[i] = row[i].replaceAll("(\\r|\\n|\\r\\n)+", " ");
                row[i] = row[i].replaceAll(System.getProperty("line.separator"), "; ");
                row[i] = row[i].replaceAll("&", "and");
            }

            System.out.println(Arrays.toString(row));
        }
        //Get the input fields
        List<String[]> map = getMap();
        String[] field;
        //Numbering for folders, folderNum is incremented for each new file
        long folderNum;
        folderNum = 0;
        for (String[] row : allRows) {
            //Creating new folder
            File file1 = new File("./output//newdir//folder" + folderNum + "");
            file1.mkdirs();
            //Creating content file
            PrintWriter writer_content = new PrintWriter("./output//newdir//folder" + folderNum + "//contents",
                    "UTF-16");
            //Creating metadata_lrmi.xml
            PrintWriter writer_lrmi = new PrintWriter(
                    "./output//newdir//folder" + folderNum + "//metadata_lrmi.xml", "UTF-16");
            //Creating content.xml
            PrintWriter writer = new PrintWriter("./output//newdir//folder" + folderNum + "//content.xml",
                    "UTF-16");
            writer.println("<?xml version=\"1.0\" encoding=\"utf-16\" standalone=\"no\"?>");
            writer.println("<dublin_core schema=\"dc\">");
            writer_lrmi.println("<?xml version=\"1.0\" encoding=\"utf-16\" standalone=\"no\"?>");
            writer_lrmi.println("<dublin_core shema=\"lrmi\">");
            for (int i = 0; i < row.length; i++) {
                //After snooping data, we have to change these setting for each new csv file, as the data fileds are many times mismatched
                //These if-else statements take care of mismatched steps.
                if (i == 43) {
                    continue;
                } else if (i == 43) {
                    field = map.get(42);
                } else if (i == 44) {
                    field = map.get(43);
                } else if (i == 45 || i == 46) {
                    continue;
                } else {
                    field = map.get(i);
                }
                //Separate multiple values
                val_array = parseVal(row[i]);
                //                if (val_array.length == 0) {
                //                    continue;
                //                }
                PrintWriter useWriter = writer;
                if (field[0].equals("lrmi")) {
                    useWriter = writer_lrmi;
                }
                switch (field.length) {
                case 2:
                    writeXML(useWriter, field[1], "", val_array);
                    break;
                case 3:
                    writeXML(useWriter, field[1], field[2], val_array);
                    break;
                default:

                }
            }
            folderNum++;
            writer.println("</dublin_core>");
            writer_lrmi.println("</dublin_core>");
            writer.close();
            writer_lrmi.close();
            writer_content.close();
        }
    }

    //Supportive function to write into xml files
    private static void writeXML(PrintWriter writer, String elem, String qual, String[] val_array) {
        //Different conditions to map E-library's format to our metadata format
        if (elem.equals("language")) {
            val_array = getLang(val_array);
        } else if (elem.equals("educationalAlignment") && qual.equals("educationalFramework")) {
            val_array = getEduFrmwrk(val_array);
        } else if (elem.equals("educationalAlignment") && qual.equals("educationalLevel")) {
            val_array = getEduLvl(val_array);
        } else if (elem.equals("learningResourceType")) {
            val_array = getLearningResourceType(val_array);
        } else if (elem.equals("format") && qual.equals("extent")) {
            val_array = getFormatExtent(val_array);
        } else if (elem.equals("format") && qual.equals("difficultylevel")) {
            val_array = getFormatDifficultyLevel(val_array);
        } else if (elem.equals("type") && qual.equals("")) {
            val_array = getType(val_array);
        } else if (elem.equals("subject") && qual.equals("keyword")) {
            val_array = getSubjectKeyword(val_array);
        } else if (elem.equals("timeRequired") && qual.equals("")) {
            val_array = getTimeRequired(val_array);
        }
        //Write in xml file
        for (int i = 0; i < val_array.length; i++) {
            //Skip if there is no entry for the field
            if (val_array[i].equals("") || val_array[i] == null) {
                continue;
            } else {
                String writer1 = "<dcvalue element =\"" + elem + "\"";
                if (qual != "") {
                    writer1 = writer1 + " qualifier=\"" + qual + "\"";
                }
                if (val_array[i] != "") {
                    writer1 = writer1 + ">" + val_array[i] + "</dcvalue>";
                } else {
                    writer1 = writer1 + " />";
                }
                writer.println(writer1);
            }
        }
    }

    //Prepare the field map first similar to map.csv given in input folder
    private static List getMap() throws FileNotFoundException, IOException {
        String file_to_parse;
        file_to_parse = "./input/map.csv";
        //Build reader instance
        CSVReader reader = new CSVReader(new FileReader(file_to_parse), '.');

        //Read all rows at once
        List<String[]> allRows = reader.readAll();
        for (String[] row : allRows) {
            for (int i = 0; i < row.length; i++) {
                row[i] = row[i].replaceAll("(\\r|\\n|\\r\\n|)+", "");
                row[i] = row[i].replaceAll("\\s+", "");
                row[i] = row[i].replaceAll(System.getProperty("line.separator"), "; ");
            }
        }
        return allRows;

    }
    //Convert the language to our format, add more if-else cluase for other languages

    private static String[] getLang(String[] val) {
        for (int i = 0; i < val.length; i++) {
            if (val[i].equals("English")) {
                val[i] = "eng";
            } else if (val[i].equals("Hindi")) {
                val[i] = "hin";
            } else if (val[i].equals("Bengali")) {
                val[i] = "ben";
            }
        }
        return val;
    }

    //Separate multiple values from string to array
    private static String[] parseVal(String val) {
        val = val.replaceAll("\"", "");
        val = val.replace("; |;", " ");
        String delimiter = ";";
        String[] val_array = val.split(delimiter);
        for (int i = 0; i < val_array.length; i++) {
            val_array[i] = val_array[i].trim();
        }
        return val_array;
    }

    private static String[] getEduLvl(String[] val) {
        for (int i = 0; i < val.length; i++) {
            if (val[i].equals("Under Graduate") || val[i].equals("Post Graduate")) {
                val[i] = "ug_pg";
            } else if (val[i].equals("I") || val[i].equals("II") || val[i].equals("III") || val[i].equals("IV")) {
                val[i] = "lowerPrimary";
            } else if (val[i].equals("V") || val[i].equals("VI") || val[i].equals("VII") || val[i].equals("VIII")) {
                val[i] = "upperPrimary";
            } else if (val[i].equals("IX") || val[i].equals("X")) {
                val[i] = "middleSchool";
            } else if (val[i].equals("XI") || val[i].equals("XII")) {
                val[i] = "highSchool";
            }
        }
        int end = val.length;
        Set<String> set = new HashSet<String>();
        for (int i = 0; i < end; i++) {
            set.add(val[i]);
        }
        val = set.toArray(new String[set.size()]);
        return val;
    }

    private static String[] getEduFrmwrk(String[] val) {
        for (int i = 0; i < val.length; i++) {
            if (val[i].equals("I.C.S.E")) {
                val[i] = "Indian Board of School Education";
            } else if (val[i].equals("C.B.S.E")) {
                val[i] = "Central Board of Secondary Education";
            }
        }
        return val;
    }

    private static String[] getLearningResourceType(String[] val_array) {
        String delimiter = "/";
        String[] new_val_array = val_array[0].split(delimiter);
        for (int i = 0; i < new_val_array.length; i++) {
            new_val_array[i] = new_val_array[i].trim();
            if (new_val_array[i].equals("Audio-Video Lecture")) {
                new_val_array[i] = "video";
            }
            if (new_val_array[i].equals("Tutorial")) {
                new_val_array[i] = "exercise";
            }
        }
        return new_val_array;
    }

    private static String[] getFormatExtent(String[] val_array) {
        val_array = ArrayUtils.removeElement(val_array, val_array[0]);
        return val_array;
    }

    private static String[] getFormatDifficultyLevel(String[] val_array) {
        for (int i = 0; i < val_array.length; i++) {
            val_array[i] = val_array[i].toLowerCase();
        }
        return val_array;
    }

    private static String[] getType(String[] val_array) {
        for (int i = 0; i < val_array.length; i++) {
            //"text, video, audio, image, presentation,application, animation, simulation""+ "
            val_array[i] = val_array[i].toLowerCase();
        }
        return val_array;
    }

    private static String[] getSubjectKeyword(String[] val_array) {
        List<String> toRemove = new ArrayList<>();
        for (String val : val_array) {
            if (val.contains(".")) {
                toRemove.add(val);
            }
        }
        for (String val : toRemove) {
            val_array = ArrayUtils.removeElement(val_array, val);
        }
        return val_array;
    }

    //Assumption, maximum time required is in hours only. Doesn't extends upto days.
    private static String[] getTimeRequired(String[] val_array) {
        String delimiter = ":";
        String[] time_array = val_array[0].split(delimiter);
        if (time_array.length >= 3) {
            String time = "PT" + time_array[0] + "H" + time_array[1] + "M" + time_array[2] + "S";
            String[] new_val_array = { time };
            val_array = new_val_array;
        }
        return val_array;
    }

}