package edu.unika.aifb.rdf.rdfcrawler;

import java.io.IOException;
import java.io.File;
import java.io.FileWriter;
import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Date;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;

import edu.unika.aifb.rdf.api.model.*;
import edu.unika.aifb.rdf.api.util.*;

/**
 * <p>RDFCrawler is intended as the only public class to be used
 * by every application which needs to embed RDF Crawler functionality.
 * If you are the "enduser" of RDF Crawler, you should not directly use
 * any other classes from the package.</p>
 *
 * <p>It initializes several main modules
 * and keeps references to these modules
 * as well as implements public methods to interact with these modules.
 * This class is responsible for RDF model export and
 * for logging of all the actions taken by the crawler.</p>
 *
 * <p>Typical usage: create an instance, configure it through the setters
 * (or {@link #loadConfiguration(Configuration)}), call {@link #crawl()} and
 * then {@link #writeResults()}. {@link #doCrawl(String[])} bundles these
 * steps for command-line style invocation.</p>
 *
 * <p>An overview of other classes in the RDF Crawler (if you decide to use or change them):</p>
 * <dl>
 * <dt>URLList, URLStruct, HostFilter, FilterException</dt>
 *     <dd>urlList maintains dynamic list of URIs together with crawling information, exceptions etc.</dd>
 * <dt>ChannelPool, Channel</dt>
 *     <dd>thread pool (ChannelPool) which initializes threads (Channels) processing individual URLs</dd>
 * <dt>Cache, NetRetrieve</dt>
 *     <dd>Caching and networking</dd>
 * <dt>DocInstance, HTMLInstance, RDFInstance</dt>
 *     <dd>Document processing - accumulates info on URLs to follow, namespaces and RDF facts</dd>
 * </dl>
 */
public class RDFCrawler {
    /** Command-line usage message printed by {@link #doCrawl(String[])} on invalid arguments. */
    private static final String USAGE="Usage: edu.unika.aifb.rdf.rdfcrawler.RDFCrawler (<uri> [<depth> [<time>]]) | (-C <configuration_URI>)";
    /** Placeholder URI handed to RDFManager when the in-memory model is created. */
    private static final String DUMMY_MODEL_URI="file:///c:/temp/dummy";
    /** Output location used when the crawler is started with a plain URI instead of a configuration. */
    private static final String DEFAULT_OUTPUT_URI="file:///c:/crawlmodel.rdf";

    /** All the URLs we have to crawl. Threads concurrently update this. */
    protected URLList m_urlList;
    /**
     * Cache of mappings: URLs. Currently we cache all the retrieved documents.
     * Even if we decide not to cache everything, it might be useful to cache frequently used
     * ontologies, etc.
     */
    protected Cache m_cache;
    /** Thread pool - runs {@link #m_capacity} channel threads. Created in {@link #crawl()}. */
    protected ChannelPool m_pool;
    /** Depth of crawling. */
    protected int m_depth;
    /**
     * Time limit in seconds; handed unchanged to the ChannelPool.
     * NOTE(review): older comments describe this both as the total crawling time
     * ("-1" meaning unlimited) and as a per-host connection timeout - confirm
     * against ChannelPool which interpretation is actually implemented.
     * Actually we might want to provide a new method "stop()" in this
     * crawler, so that the user may stop RDFCrawler at any time and
     * dump the results.
     */
    protected int m_time;
    /** RDF model - we are building it from small pieces. Null until {@link #crawl()} is called. */
    protected Model m_model;
    /**
     * How many threads in the ThreadPool
     * Feel free to change this for optimum performance
     */
    protected final int m_capacity=1;
    /** URL where to store the LOG file of the crawling process */
    protected URI m_logURL;
    /** URL where to store the output model of all the RDF facts */
    protected URI m_outputURL;
    /** URL where to store the cache map */
    protected URI m_cacheURL;
    /** Address of the proxy host. */
    protected String m_proxyHost;
    /** Number of the proxy port. */
    protected int m_proxyPort;

    /**
     * Creates a crawler with an empty URL list and an empty cache.
     * All crawling parameters (start URLs, host filter, depth, time,
     * output locations, proxy) are supplied afterwards through the setters
     * or through {@link #loadConfiguration(Configuration)}.
     */
    public RDFCrawler() {
        m_urlList=new URLList();
        m_cache=new Cache();
    }
    /**
     * Registers the start URLs of the crawl.
     * Each URL is added with the depth that is configured <em>at the time of
     * this call</em>, so {@link #setDepth(int)} must be called first.
     *
     * @param urls list of URL strings; a <code>null</code> list is treated as empty
     */
    public void addInitialURLs(List urls) {
        if (urls==null)
            return;
        for (Iterator iterator=urls.iterator();iterator.hasNext();) {
            String url=(String)iterator.next();
            m_urlList.addURL(url,null,m_depth);
        }
    }
    /**
     * Installs the host filter on the URL list.
     *
     * @param hostFilter list of hosts, passed on to the URL list unchanged
     * @throws MalformedURLException propagated from the URL list
     */
    public void setHostFilter(List hostFilter) throws MalformedURLException {
        m_urlList.setFilter(hostFilter);
    }
    /**
     * Sets the crawling depth. Only affects URLs added afterwards through
     * {@link #addInitialURLs(List)}.
     *
     * @param depth the crawling depth
     */
    public void setDepth(int depth) {
        m_depth=depth;
    }
    /**
     * Sets the time limit handed to the channel pool (see {@link #m_time}).
     *
     * @param time time in seconds
     */
    public void setTime(int time) {
        m_time=time;
    }
    /**
     * Sets where the cache log is written; <code>null</code> disables it in {@link #writeResults()}.
     *
     * @param cacheURL file URI of the cache log
     */
    public void setCacheURL(URI cacheURL) {
        m_cacheURL=cacheURL;
    }
    /**
     * Sets where the URI log is written; <code>null</code> disables it in {@link #writeResults()}.
     *
     * @param logURL file URI of the URI log
     */
    public void setLogURL(URI logURL) {
        m_logURL=logURL;
    }
    /**
     * Sets where the RDF model is written; <code>null</code> disables it in {@link #writeResults()}.
     *
     * @param outputURL file URI of the model output
     */
    public void setOutputURL(URI outputURL) {
        m_outputURL=outputURL;
    }
    /**
     * Sets the proxy host; <code>null</code> means no proxy properties are set by {@link #crawl()}.
     *
     * @param proxyHost proxy host name
     */
    public void setProxyHost(String proxyHost) {
        m_proxyHost=proxyHost;
    }
    /**
     * Sets the proxy port; only used when a proxy host is set.
     *
     * @param proxyPort proxy port number
     */
    public void setProxyPort(int proxyPort) {
        m_proxyPort=proxyPort;
    }
    /**
     * Copies all settings from the given configuration into this crawler.
     * The depth is applied before the provider list is added because
     * {@link #addInitialURLs(List)} stamps each URL with the current depth.
     *
     * @param configuration the configuration to read from
     * @throws MalformedURLException propagated from {@link #setHostFilter(List)}
     * @throws URISyntaxException if the cache, log or output URL of the configuration is not a valid URI
     */
    public void loadConfiguration(Configuration configuration) throws MalformedURLException,URISyntaxException {
        setDepth(configuration.getDefaultDepth());
        setTime(configuration.getTimeOut());
        setCacheURL(new URI(configuration.getCacheURL()));
        setLogURL(new URI(configuration.getLogURL()));
        setOutputURL(new URI(configuration.getOutputURL()));
        setProxyHost(configuration.getProxyHost());
        setProxyPort(configuration.getProxyPort());
        addInitialURLs(configuration.getProviderList());
        if (configuration.getDomainFilter()!=null)
            setHostFilter(configuration.getDomainFilter());
    }
    /**
     * Returns the RDF model built by the crawl.
     *
     * @return the model, or <code>null</code> if {@link #crawl()} has not been called yet
     */
    public Model getModel() {
        return m_model;
    }
    /**
     * Start Crawling. All the RDFCrawler does is -
     * apply the proxy settings (if any), create an empty model and
     * initialize and start the ChannelPool.
     * All the actual work - crawling, RDF generation etc.
     * is done by the {@link #m_capacity} channel threads in the ChannelPool.
     *
     * @throws Exception propagated from model creation or from the channel pool
     */
    public void crawl() throws Exception {
        if (m_proxyHost!=null) {
            // JVM-wide setting: affects every connection opened in this process.
            System.setProperty("proxyHost",m_proxyHost);
            System.setProperty("proxyPort",String.valueOf(m_proxyPort));
        }
        m_model=RDFManager.createModel(DUMMY_MODEL_URI,null);
        m_pool=new ChannelPool(m_model,m_urlList,m_cache,m_capacity,m_time);
        m_pool.crawl();
    }
    /**
     * Saves the URI log (the string form of the URL list) to the log URL.
     *
     * @throws IOException if the file cannot be written
     * @throws IllegalArgumentException if the log URL is not an absolute <code>file:</code> URI
     */
    public void saveURILog() throws IOException {
        writeTextFile(m_logURL,m_urlList.toString());
    }
    /**
     * Saves the cache log (the string form of the cache) to the cache URL.
     *
     * @throws IOException if the file cannot be written
     * @throws IllegalArgumentException if the cache URL is not an absolute <code>file:</code> URI
     */
    public void saveCacheLog() throws IOException {
        writeTextFile(m_cacheURL,m_cache.toString());
    }
    /**
     * Saves the model to the output URL using the default RDF encoding.
     * Any failure reported by the writer is rethrown as an IOException
     * carrying the original exception as its cause.
     *
     * @throws IOException if the model cannot be written
     */
    public void saveModel() throws IOException {
        try {
            RDFUtil.writeModel(m_model,new File(m_outputURL),RDFUtil.DEFAULT_ENCODING);
        }
        catch (Exception e) {
            IOException ioe=new IOException("Error saving model");
            ioe.initCause(e);
            throw ioe;
        }
    }
    /**
     * Write out the results: the URI log, the cache log and the model,
     * each only if its target URL has been configured.
     *
     * @throws IOException if any of the outputs cannot be written
     */
    public void writeResults() throws IOException {
        if (m_logURL!=null)
            saveURILog();
        if (m_cacheURL!=null)
            saveCacheLog();
        if (m_outputURL!=null)
            saveModel();
    }
    /**
     * Starts crawling with given command-line style arguments.
     * Accepted forms are <code>&lt;uri&gt; [&lt;depth&gt; [&lt;time&gt;]]</code>
     * (depth defaults to 2, time to 60) and <code>-C &lt;configuration_URI&gt;</code>.
     * Prints the effective parameters, crawls, and writes the results.
     *
     * @param args the arguments as described above
     * @return the crawled model, or <code>null</code> if the arguments were
     *         invalid and only the usage message was printed
     * @throws Exception propagated from configuration loading, crawling or result writing
     */
    public Model doCrawl(String[] args) throws Exception {
        boolean useConfiguration=args!=null && args.length>=1 && "-C".equals(args[0]);
        // "-C" without a configuration URI is an error, not a start URI named "-C".
        if (args==null || args.length<1 || (useConfiguration && args.length<2)) {
            System.err.println(USAGE);
            return null;
        }
        if (useConfiguration) {
            Configuration configuration=new Configuration(args[1]);
            loadConfiguration(configuration);
        }
        else {
            // Depth must be set before the start URI is added (see addInitialURLs).
            setDepth(getArgument(args,1,2));
            setTime(getArgument(args,2,60));
            setOutputURL(new URI(DEFAULT_OUTPUT_URI));
            List uris=new ArrayList();
            uris.add(args[0]);
            addInitialURLs(uris);
        }
        System.out.println("Starting RDFCrawler with following parameters:");
        System.out.println("    Number of channels: "+m_capacity);
        System.out.println("    Crawling depth: "+m_depth);
        System.out.println("    Log URL: "+m_logURL);
        System.out.println("    Cache URL: "+m_cacheURL);
        System.out.println("    RDF output model URL: "+m_outputURL);
        System.out.println("Starting URL list:");
        Iterator iterator=m_urlList.iterator();
        while (iterator.hasNext()) {
            URLStruct urlStruct=(URLStruct)iterator.next();
            System.out.println(urlStruct.getURL());
        }
        System.out.println("---- Crawling started at: "+new Date());
        crawl();
        writeResults();
        System.out.println("---- Crawling finished at: "+new Date());
        return getModel();
    }
    /**
     * Used to call RDFCrawler from DOS command line.
     * Normally RDFCrawler is instantiated from elsewhere - a Windows interface
     * or an application which embeds the RDF Crawler.
     *
     * @param args see {@link #doCrawl(String[])}
     */
    public static void main(String[] args) throws Exception {
        RDFCrawler crawler=new RDFCrawler();
        crawler.doCrawl(args);
    }
    /**
     * Reads an optional integer argument.
     *
     * @param args the argument array
     * @param index position of the argument to read
     * @param defaultValue value to use if the argument is absent or not a valid integer
     * @return the parsed argument, or <code>defaultValue</code>
     */
    protected static int getArgument(String[] args,int index,int defaultValue) {
        int result=defaultValue;
        if (index<args.length) {
            try {
                result=Integer.parseInt(args[index]);
            }
            catch (NumberFormatException e) {
                // Deliberately lenient: report the problem and fall back to the default.
                System.out.println("Incorrect number format: "+args[index]);
            }
        }
        return result;
    }
    /**
     * Writes the given text to the file denoted by the URI, always closing the writer.
     * Uses the platform default character encoding, as FileWriter does.
     */
    private static void writeTextFile(URI target,String text) throws IOException {
        FileWriter output=new FileWriter(new File(target));
        try {
            output.write(text);
        }
        finally {
            output.close();
        }
    }
}
