package edu.unika.aifb.rdf.rdfcrawler;

import java.util.Map;
import java.util.HashMap;
import java.util.Set;
import java.util.HashSet;
import java.util.List;
import java.util.Iterator;
import java.net.MalformedURLException;

/**
 * URLList uses URLStruct to track which URLs should be processed.
 */
public class URLList {
    /**
     * File extensions (without the dot) that are accepted for crawling.
     * The empty string admits URLs that have no extension at all.
     */
    protected static final Set s_processExtensions=new HashSet();
    static {
        s_processExtensions.add("");
        s_processExtensions.add("html");
        s_processExtensions.add("htm");
        s_processExtensions.add("xml");
        s_processExtensions.add("rdf");
        s_processExtensions.add("rdfs");
        s_processExtensions.add("daml");
    }

    /** Maps a URL string (fragment already removed) to its URLStruct. */
    protected Map m_urlMap;
    /** Optional host filter; null means that no host filtering is applied. */
    protected HostFilter m_filter;
    /**
     * Number of registered URLs per status, indexed by the URLStruct status
     * constants (presumably NOT_PROCESSED, BEING_PROCESSED, PROCESSED and
     * ERROR -- verify against URLStruct).
     */
    protected int[] m_statesCount;

    /**
     * Creates an empty list without a host filter.
     */
    public URLList() {
        m_urlMap=new HashMap();
        m_statesCount=new int[4];
    }
    /**
     * Add a single URL with the given crawling depth and given parent
     * (this public method is called from Console in
     * a synchronized manner, to add new URLs discovered by
     * all the crawling threads/channels).
     * A URL that is already registered is ignored. A URL that fails the
     * syntax check, has an unsupported extension or is rejected by the host
     * filter is still registered, but with status ERROR and the causing
     * exception attached.
     *
     * @param url        URL to register; a reference part (beyond #) is removed
     * @param parentURL  URL on which this one was found, or null for a
     *                   top-level URL; a reference part is removed as well
     * @param depth      crawling depth stored with the URL
     */
    public void addURL(String url,String parentURL,int depth) {
        // Delete the reference part (beyond #) from url and parentURL
        url=cutRef(url);
        parentURL=cutRef(parentURL);
        if (m_urlMap.containsKey(url))
            return;
        URLStruct us=new URLStruct(url,parentURL,depth);
        try {
            // if url or parent is syntactically incorrect, this throws an exception
            us.assertURLIsOK();
            // deal with file extension of the url
            if (!s_processExtensions.contains(us.getExtension()))
                throw new FilterException("Unsupported extension '"+us.getExtension()+"'");
            if (m_filter!=null)
                m_filter.assertIsInFilter(us.getHost());
        }
        catch (Exception e) {
            us.setStatus(URLStruct.ERROR);
            us.setException(e);
        }
        // add this to the list, even if it's an error
        m_urlMap.put(url,us);
        m_statesCount[us.getStatus()]++;
    }
    /**
     * This method is used to add to the URLList if
     * depth is not known in advance. The depth is taken from the parent's
     * entry and optionally decremented; the URL is only added while the
     * resulting depth is not negative.
     *
     * @param url        URL to register
     * @param parentURL  URL of an already registered parent
     * @param decrement  if true, the new URL gets the parent's depth minus one
     * @throws IllegalArgumentException if parentURL is not registered in this list
     */
    public void addURL(String url,String parentURL,boolean decrement) {
        // Map keys are stored without the reference part, so the lookup key
        // has to be normalized the same way.
        URLStruct parent=(URLStruct)m_urlMap.get(cutRef(parentURL));
        if (parent==null)
            throw new IllegalArgumentException("Unknown parent URL '"+parentURL+"'");
        int depth=parent.getDepth();
        if (decrement)
            depth--;
        if (depth>=0)
            addURL(url,parentURL,depth);
    }
    /**
     * Installs a host filter built from the given list of hosts.
     *
     * @param hosts  list handed to the HostFilter constructor
     * @throws MalformedURLException if HostFilter rejects the list
     */
    public void setFilter(List hosts) throws MalformedURLException {
        m_filter=new HostFilter(hosts);
    }
    /**
     * Returns true if every registered URL is either in status ERROR or
     * in status PROCESSED.
     */
    public boolean listProcessed() {
        return m_urlMap.size()==m_statesCount[URLStruct.ERROR]+m_statesCount[URLStruct.PROCESSED];
    }
    /**
     * Picks one URL in status NOT_PROCESSED, switches it to BEING_PROCESSED
     * and returns it.
     *
     * @return the claimed URLStruct, or null if no URL is in status NOT_PROCESSED
     */
    public URLStruct getUnprocessedURL() {
        Iterator iterator=m_urlMap.values().iterator();
        while (iterator.hasNext()) {
            URLStruct us=(URLStruct)iterator.next();
            if (us.getStatus()==URLStruct.NOT_PROCESSED) {
                // move the entry from one status counter to the other
                m_statesCount[URLStruct.NOT_PROCESSED]--;
                m_statesCount[URLStruct.BEING_PROCESSED]++;
                us.setStatus(URLStruct.BEING_PROCESSED);
                return us;
            }
        }
        return null;
    }
    /**
     * Switches the given URL to status ERROR and records the exception.
     *
     * @param us  entry to update
     * @param e   exception stored on the entry
     */
    public void markURLInError(URLStruct us,Exception e) {
        m_statesCount[us.getStatus()]--;
        us.setStatus(URLStruct.ERROR);
        us.setException(e);
        m_statesCount[URLStruct.ERROR]++;
    }
    /**
     * Switches the given URL to status PROCESSED.
     *
     * @param us  entry to update
     */
    public void markURLProcessed(URLStruct us) {
        m_statesCount[us.getStatus()]--;
        us.setStatus(URLStruct.PROCESSED);
        m_statesCount[URLStruct.PROCESSED]++;
    }
    /**
     * Returns an iterator over all registered URLStruct objects.
     */
    public Iterator iterator() {
        return m_urlMap.values().iterator();
    }
    /**
     * Get back a nice RDF representation of what URLs
     * are placed in the list for crawling.
     */
    public String toString() {
        synchronized (this) {
            StringBuffer result=new StringBuffer();
            // XML header
            result.append("<?xml version='1.0' encoding='ISO-8859-1'?>").append("\n");
            result.append("<rdf:RDF xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#'").append("\n");
            result.append("xmlns:a='http://TheCrawler#'>").append("\n");
            // print all top-level URLs with their successors
            result.append(getDescriptions(null));

            // Print also the elements of hostfilter in RDF format:
            if (m_filter!=null) {
                result.append(m_filter.toString());
            }
            result.append("</rdf:RDF>\n");
            return result.toString();
        }
    }

    /**
     * Get back a nice representation of those URLs
     * which are being crawled into from the given URL parent
     * (parent=null for the top-level URLs). The children of every
     * matching URL are nested recursively between its opening and
     * closing strings.
     *
     * @param parent  parent URL to match, or null for the top-level URLs
     * @return the concatenated descriptions; empty if nothing matches
     */
    public String getDescriptions(String parent) {
        StringBuffer result=new StringBuffer();
        Iterator iterator=m_urlMap.values().iterator();
        while (iterator.hasNext()) {
            URLStruct us=(URLStruct)iterator.next();
            String entryParent=us.getParentURL();
            boolean matches;
            if (parent==null)
                matches=(entryParent==null);
            else
                matches=parent.equals(entryParent);
            if (matches) {
                result.append(us.openString());
                result.append(getDescriptions(us.getURL()));
                result.append(us.closeString());
            }
        }
        return result.toString();
    }
    /**
     * This function cuts away a reference part from a URL to
     * avoid duplication of URLs when crawling, in case if
     * they differ only in their reference part.
     *
     * @param url  URL to normalize; may be null
     * @return the text before the first '#', the unchanged url if it
     *         contains no '#', or null if url is null
     */
    public static String cutRef(String url) {
        if (url==null)
            return null;
        int hashPos=url.indexOf('#');
        return hashPos>=0 ? url.substring(0,hashPos) : url;
    }
}
