wiki.link.LinkImporter.java Source code

Java tutorial

Introduction

Here is the source code for wiki.link.LinkImporter.java

Source

package wiki.link;

import org.springframework.jdbc.CannotGetJdbcConnectionException;
import wiki.DbConnector;
import wiki.doc.Doc;
import wiki.doc.DocMapper;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Created by Michael Kelley on 4/30/14.
 * See LICENSE file for license information.
 */
public class LinkImporter {
    public static void main(String[] args) {
        LinkImporter wr = new LinkImporter();
        String path = null;
        for (int i = 0; i < args.length; i++) {
            if ("path".equals(args[i]) && i + 1 < args.length) {
                path = args[i + 1];
            }
        }
        if (path == null) {
            System.out.println("Please specify the file path.");
            System.exit(1);
        }
        //wr.readXML("/mnt/dev/wiki/enwiki-20140203-pages-articles.xml");
        wr.readXML(path);
    }

    public boolean readXML(String filename) {

        //System.exit(1);
        //MAKE BLOODY SURE YOU HAVE SOME HOURS.
        DbConnector dbc = new DbConnector("localhost");
        dbc.jdbcTemplate.update("TRUNCATE links;");
        try {
            XMLInputFactory xif = XMLInputFactory.newInstance();
            XMLStreamReader xsr = xif.createXMLStreamReader(new FileInputStream(filename));

            long n = 0;
            List<Doc> toSave = new ArrayList<>();
            while (xsr.hasNext()) {
                xsr.next();
                if (xsr.getEventType() == XMLStreamReader.START_ELEMENT) {
                    if (xsr.getLocalName().equals("page")) {
                        long id = -1;
                        String title = null;
                        String text = null;
                        while (xsr.hasNext()) {
                            xsr.next();
                            if (xsr.getEventType() == XMLStreamReader.START_ELEMENT) {
                                if (xsr.getLocalName().equals("id") && id == -1) {
                                    id = Long.parseLong(xsr.getElementText());
                                }
                                if (xsr.getLocalName().equals("title")) {
                                    title = xsr.getElementText();
                                }
                                if (xsr.getLocalName().equals("text")) {
                                    text = xsr.getElementText();
                                }
                            } else if (xsr.getEventType() == XMLStreamReader.END_ELEMENT
                                    && xsr.getLocalName().equals("page")) {
                                break;
                            }
                        }
                        if (id != -1 && title != null && text != null) {
                            Doc wd = new Doc(id, title, text);
                            toSave.add(wd);
                            n++;
                            if (n % 1000 == 0) {
                                insertLinks(toSave, dbc);
                                //                                WikiDoc.insertAll(toSave, dbc);
                                System.out.println(n);
                                toSave.clear();
                            }
                        }
                    }
                }
            }
        } catch (IOException | XMLStreamException e) {
            e.printStackTrace();
        }
        return true;
    }

    private String getParamList(int n) {
        StringBuilder sb = new StringBuilder("(?");
        for (int i = 1; i < n; i++) {
            sb.append(", ?");
        }
        sb.append(")");
        return sb.toString();
    }

    private List<Doc> getPages(List<String> links, DbConnector dbc) {
        long n = links.size();
        // Max is actually in the area of 2^15, margin included for safety.
        // Limit is imposed by the JDBC driver limit on # of parameters for
        // a prepared statement.
        if (n <= 30000) {
            String[] linkArray = new String[links.size()];
            linkArray = links.toArray(linkArray);
            List<Doc> pages = dbc.jdbcTemplate.query(
                    "SELECT * FROM pages WHERE title IN " + getParamList(links.size()), linkArray, new DocMapper());
            return pages;
        } else {
            // Running into issues with stack space is a possibility here
            // But if the code is hundreds of levels of deep in 30000 element
            // lists, there should be some more thought as to why it's getting
            // 3,000,000+ pages for a set of links.
            List<Doc> first = getPages(links.subList(0, 30000), dbc);
            first.addAll(getPages(links.subList(30000, links.size()), dbc));
            return first;
        }
    }

    private void insertLinks(List<Doc> docs, DbConnector dbc) {
        ArrayList<Link> linksToSave = new ArrayList<>();
        for (Doc linkedFrom : docs) {
            ArrayList<String> links = linkedFrom.getTextLinks();

            if (links.size() == 0) {
                continue;
            }
            try {
                List<Doc> pages = getPages(links, dbc);
                for (Doc linkedTo : pages) {
                    Link wl = new Link(linkedFrom.id, linkedTo.id);
                    linksToSave.add(wl);
                }
            } catch (CannotGetJdbcConnectionException e) {
                try {
                    Thread.sleep(120000);
                } catch (InterruptedException e1) {
                    e1.printStackTrace();
                    System.exit(1);
                    //do nothing. should never be interrupted
                }
                insertLinks(docs, dbc);
                return;
            }
        }
        try {
            LinkResource.insertAll(linksToSave, dbc);
        } catch (CannotGetJdbcConnectionException e) {
            try {
                Thread.sleep(120000);
            } catch (InterruptedException e1) {
                e1.printStackTrace();
                System.exit(1);
                //do nothing. should never be interrupted
            }
            LinkResource.insertAll(linksToSave, dbc);
        }
    }
}