/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
package communities.parseciteceer;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * Scrapes author, publication and citation pages of CiteSeerX with jsoup and stores the
 * extracted data through the project's {@code Database} class.
 *
 * <p>All methods are static. Every method that writes to the database opens its own
 * {@code Database} instance and closes it in a {@code finally} block.
 *
 * @author Vladimir
 */
public class ParserCiteSeer {

    private static final Logger LOGGER = Logger.getLogger(ParserCiteSeer.class.getName());

    /** Base URL of a document summary page; the query string is appended to it. */
    private static final String DOCUMENT_SUMMARY_URL = "http://citeseer.ist.psu.edu/viewdoc/summary";

    /** URL prefix of an author summary page; the numeric author id is appended to it. */
    private static final String AUTHOR_SUMMARY_URL = "http://citeseer.ist.psu.edu/viewauth/summary?aid=";

    /** Query suffix appended to author and publication URLs. */
    private static final String FULL_LIST_SUFFIX = "&list=full";

    /** Default author id range crawled by {@link #parseAuthors()} (end is exclusive). */
    private static final int FIRST_AUTHOR_ID = 20441;
    private static final int END_AUTHOR_ID = 310000;

    /** Affiliations of this length or longer are rejected by {@link #parseAuthors()}. */
    private static final int MAX_AFFILIATION_LENGTH = 255;

    /**
     * Number of leading characters dropped from the "description" meta tag.
     * NOTE(review): presumably a fixed boilerplate prefix emitted by the site - confirm.
     */
    private static final int DESCRIPTION_PREFIX_LENGTH = 77;

    /** How many times a page download is attempted before the failure is reported. */
    private static final int MAX_FETCH_ATTEMPTS = 3;

    /**
     * Returns every element carrying an {@code href} attribute inside the first element
     * whose class is {@code refs}.
     *
     * @param doc parsed page
     * @return the matching elements, or an empty list when the page has no {@code refs} element
     */
    public static Elements getElementsPublication(Document doc) {
        Elements refsTables = doc.getElementsByAttributeValue("class", "refs");
        if (refsTables.isEmpty()) {
            return new Elements();
        }
        return refsTables.get(0).getElementsByAttribute("href");
    }

    /**
     * Extracts the author name: the first child node of the first {@code h2} element
     * inside the first element of {@code element}, rendered with {@code toString()}.
     *
     * @param element the {@code viewAuth} elements of an author page
     * @return the author name, or an empty string when the heading is missing or empty
     */
    public static String getAuthorName(Elements element) {
        if (element.isEmpty()) {
            return "";
        }
        Elements headings = element.get(0).getElementsByTag("h2");
        if (headings.isEmpty() || headings.get(0).childNodes().isEmpty()) {
            return "";
        }
        return headings.get(0).childNode(0).toString();
    }

    /**
     * Extracts the affiliation from the row labelled "Affiliation".
     *
     * @param viewsAuthor the {@code viewAuth} elements of an author page
     * @return the affiliation, or an empty string when it cannot be found
     */
    public static String getAuthorAfil(Elements viewsAuthor) {
        if (viewsAuthor.isEmpty()) {
            return "";
        }
        Elements labels = viewsAuthor.get(0).getElementsContainingOwnText("Affiliation");
        if (labels.isEmpty()) {
            return "";
        }
        return secondCellText(labels.get(0), "");
    }

    /**
     * Extracts the publication count from the last element whose own text contains
     * "Publications".
     *
     * @param viewsAuthor the {@code viewAuth} elements of an author page
     * @return the count as text, or {@code "0"} when it cannot be found
     */
    public static String getCountPublicactions(Elements viewsAuthor) {
        if (viewsAuthor.isEmpty()) {
            return "0";
        }
        Elements labels = viewsAuthor.get(0).getElementsContainingOwnText("Publications");
        if (labels.isEmpty()) {
            return "0";
        }
        return secondCellText(labels.get(labels.size() - 1), "0");
    }

    /**
     * Crawls the default author id range.
     *
     * @throws Exception if a database operation fails
     * @see #parseAuthors(int, int)
     */
    public static void parseAuthors() throws Exception {
        parseAuthors(FIRST_AUTHOR_ID, END_AUTHOR_ID);
    }

    /**
     * Crawls the author pages with ids in {@code [firstAuthorId, endAuthorId)} and stores
     * each author, its affiliation and its publications.
     *
     * <p>An author whose page cannot be downloaded is logged and skipped, so one
     * unreachable page cannot stall the crawl.
     *
     * @param firstAuthorId first author id to crawl (inclusive)
     * @param endAuthorId   author id at which to stop (exclusive)
     * @throws Exception if a database operation fails
     */
    public static void parseAuthors(int firstAuthorId, int endAuthorId) throws Exception {
        for (int authorId = firstAuthorId; authorId < endAuthorId; ++authorId) {
            try {
                parseAuthor(authorId);
            } catch (IOException ex) {
                LOGGER.log(Level.WARNING, "Skipping author " + authorId + " after I/O failure", ex);
            }
        }
    }

    /**
     * Worker thread that stores the citations linked from one element.
     *
     * <p>{@link #run()} makes a single pass over the links. A link that fails is logged
     * and does not prevent the remaining links from being processed.
     */
    public static class Counter extends Thread {
        Element citationArticles;
        String nameArticles;

        /**
         * @param nameArticles title of the citing article
         * @param el           element whose {@code href} descendants are the citation links
         */
        public Counter(String nameArticles, Element el) {
            citationArticles = el;
            this.nameArticles = nameArticles;
        }

        @Override
        public void run() {
            try {
                Database database = new Database();
                try {
                    for (Element citationLink : citationArticles.getElementsByAttribute("href")) {
                        try {
                            storeCitation(database, nameArticles, citationLink);
                        } catch (Exception ex) {
                            LOGGER.log(Level.SEVERE, citationLink.attr("href"), ex);
                        }
                    }
                } finally {
                    // Closed once, after the last link, so the connection is never reused after closing.
                    database.closeConnection();
                }
            } catch (Exception e) {
                LOGGER.log(Level.SEVERE, "Citation worker failed for " + nameArticles, e);
            }
        }
    }

    /**
     * Stores every citation linked from the {@code refs} elements of {@code doc}.
     * Links marked {@code citation_only} and links without a {@code ?cid} query are skipped.
     *
     * @param doc          parsed summary page of the citing article
     * @param nameArticles title of the citing article
     * @throws Exception if a cited page cannot be downloaded or a database operation fails
     */
    public static void parseCitations(Document doc, String nameArticles) throws Exception {
        Elements refsTables = doc.getElementsByAttributeValue("class", "refs");
        Database database = new Database();
        try {
            for (Element refsTable : refsTables) {
                for (Element citationLink : refsTable.getElementsByAttribute("href")) {
                    storeCitation(database, nameArticles, citationLink);
                }
            }
        } finally {
            database.closeConnection();
        }
    }

    /**
     * Downloads a document summary page, stores the paper if it is not yet in the
     * database, and optionally stores its citations.
     *
     * <p>Missing optional metadata (authors, conference, description, DOI) is stored as
     * {@code null}; a missing or non-numeric year is stored as {@code 0}.
     *
     * @param URL              address of the summary page
     * @param isParseCitations whether the citations listed on the page are stored as well
     * @return the article title, or {@code null} when the page has no {@code citation_title} meta tag
     * @throws IOException if the page cannot be downloaded
     * @throws Exception   if a database operation fails
     */
    public static String parseTitleAriticle(String URL, boolean isParseCitations) throws IOException, Exception {
        System.out.println(URL);
        Document doc = fetch(URL);
        Elements titleMeta = doc.getElementsByAttributeValue("name", "citation_title");
        if (titleMeta.isEmpty()) {
            return null;
        }
        String articleTitle = titleMeta.get(0).attr("content");

        Database database = new Database();
        try {
            if (!database.isArticleInDatabase(articleTitle)) {
                String description = lastMetaContent(doc, "description", null);
                if (description != null && description.length() >= DESCRIPTION_PREFIX_LENGTH) {
                    // Shorter descriptions have no full prefix to strip and are stored unchanged.
                    description = description.substring(DESCRIPTION_PREFIX_LENGTH);
                }
                int year = parseYear(lastMetaContent(doc, "citation_year", "0"));
                String authors = lastMetaContent(doc, "citation_authors", null);
                String publisher = lastMetaContent(doc, "citation_conference", null);
                database.insertInTablePapers(articleTitle, authors, year, publisher,
                        extractDoi(doc), description);
            }
            if (isParseCitations) {
                parseCitations(doc, articleTitle);
            }
        } finally {
            database.closeConnection();
        }
        return articleTitle;
    }

    /** Processes one author page: stores the author, its affiliation and its publications. */
    private static void parseAuthor(int authorId) throws Exception {
        Document doc = fetch(AUTHOR_SUMMARY_URL + authorId + FULL_LIST_SUFFIX);
        Elements viewsAuthor = doc.getElementsByAttributeValue("class", "viewAuth");
        if (viewsAuthor.isEmpty()) {
            return;
        }
        String nameAuthor = getAuthorName(viewsAuthor);
        if (nameAuthor.isEmpty()) {
            LOGGER.warning("Skipping author " + authorId + ": no name found on the page");
            return;
        }
        String affiliation = getAuthorAfil(viewsAuthor);
        if (affiliation.length() >= MAX_AFFILIATION_LENGTH) {
            System.out.println(authorId + "affilation не подходит");
            return;
        }

        // One connection per author, mirroring the create/close pattern used by
        // parseCitations and parseTitleAriticle.
        Database database = new Database();
        try {
            database.insertInTableAuthors(nameAuthor, String.valueOf(authorId));
            database.insertInAffiliations(affiliation);
            database.insertInAuthorAffiliation(nameAuthor, affiliation);
            for (Element publication : getElementsPublication(doc)) {
                if (publication.childNodes().isEmpty()) {
                    continue;
                }
                String href = publication.attr("href");
                int queryStart = href.indexOf('?');
                if (queryStart < 0) {
                    continue; // no query string, so no document id to look up
                }
                String link = DOCUMENT_SUMMARY_URL + href.substring(queryStart) + FULL_LIST_SUFFIX;
                String title;
                try {
                    title = parseTitleAriticle(link, true);
                } catch (IOException ex) {
                    LOGGER.log(Level.WARNING, "Skipping publication " + link, ex);
                    continue;
                }
                if (title != null) {
                    database.insertInTableAuthorsPublication(nameAuthor, title);
                }
            }
            System.out.println(authorId);
        } finally {
            database.closeConnection();
        }
    }

    /** Stores one citation link unless it is citation-only, has no {@code ?cid} query, or has no title. */
    private static void storeCitation(Database database, String nameArticles, Element citationLink)
            throws Exception {
        if (!citationLink.getElementsByAttributeValue("class", "citation_only").isEmpty()) {
            return;
        }
        String href = citationLink.attr("href");
        int queryStart = href.indexOf("?cid");
        if (queryStart < 0) {
            return;
        }
        String citedTitle = parseTitleAriticle(DOCUMENT_SUMMARY_URL + href.substring(queryStart), false);
        if (citedTitle != null) {
            database.insertInTableCitations(nameArticles, citedTitle);
        }
    }

    /** Downloads a page, retrying up to {@link #MAX_FETCH_ATTEMPTS} times before rethrowing the last failure. */
    private static Document fetch(String url) throws IOException {
        IOException lastFailure = null;
        for (int attempt = 1; attempt <= MAX_FETCH_ATTEMPTS; ++attempt) {
            try {
                return Jsoup.connect(url).get();
            } catch (IOException ex) {
                lastFailure = ex;
                LOGGER.warning("Attempt " + attempt + " of " + MAX_FETCH_ATTEMPTS
                        + " failed for " + url + ": " + ex);
            }
        }
        throw lastFailure;
    }

    /** Returns the first child node of the second {@code td} under the label's parent, or {@code fallback}. */
    private static String secondCellText(Element label, String fallback) {
        Element row = label.parent();
        if (row == null) {
            return fallback;
        }
        Elements cells = row.getElementsByTag("td");
        if (cells.size() < 2 || cells.get(1).childNodes().isEmpty()) {
            return fallback;
        }
        return cells.get(1).childNode(0).toString();
    }

    /** Returns the {@code content} of the last element whose {@code name} attribute matches, or {@code fallback}. */
    private static String lastMetaContent(Document doc, String metaName, String fallback) {
        Elements metas = doc.getElementsByAttributeValue("name", metaName);
        if (metas.isEmpty()) {
            return fallback;
        }
        return metas.get(metas.size() - 1).attr("content");
    }

    /** Parses the publication year, returning {@code 0} when the text is not a valid integer. */
    private static int parseYear(String yearText) {
        try {
            return Integer.parseInt(yearText.trim());
        } catch (NumberFormatException ex) {
            return 0;
        }
    }

    /** Extracts the value following {@code ?doi=} in the active {@code docMenu} link, or {@code null}. */
    private static String extractDoi(Document doc) {
        Elements menus = doc.getElementsByAttributeValue("id", "docMenu");
        if (menus.isEmpty()) {
            return null;
        }
        Elements activeTabs = menus.get(0).getElementsByAttributeValue("class", "active");
        if (activeTabs.isEmpty() || activeTabs.get(0).children().isEmpty()) {
            return null;
        }
        String href = activeTabs.get(0).child(0).attr("href");
        int doiStart = href.indexOf("?doi");
        if (doiStart < 0) {
            return null;
        }
        int valueStart = doiStart + "?doi=".length();
        return valueStart <= href.length() ? href.substring(valueStart) : null;
    }
}
