package edu.unika.aifb.rdf.rdfcrawler;

import java.io.*;
import java.util.*;

/**
 * DocInstance - call different document processing routines.
 * They are grouped in two classes - RDFInstance and HTMLInstance.
 * The goal of these manipulations is to obtain three things:
 * <ol>
 * <li>build RDF models from one or more fragments of RDF found within
 *     the document</li>
 * <li>extract the list of namespaces used within RDF</li>
 * <li>extract the list of URI references from both RDF and HTML parts of the document</li>
 * </ol>
 */
public class DocInstance {

    /**
     * To communicate with cache we need to know both the
     * URI of the current document and its referrer
     */
    private URLStruct url;

    /** List of URIs found in this document */
    private Vector urilist;

    /** List of RDF namespaces found in this document */
    private Vector nslist;

    /** RDF fact accumulator */
    private Vector rdflist;

    /**
     * The contents of the document;
     * various portions of it are cut and processed
     */
    private StringBuffer fstring;

    /** Local reference to the global document cache */
    private Cache cache;

    /**
     * used to find Literal instances.
     * NOTE(review): this field is never read or written anywhere in this
     * class; it looks like a leftover and is a candidate for removal.
     */
    private static Class ResourceClass;

    /**
     * Constructor: remembers the cache and the URL descriptor, loads the
     * document text through the cache and creates the (initially empty)
     * URI, namespace and RDF result lists.
     *
     * @param cache document cache used to read the document
     * @param us    descriptor of the document to process
     * @throws FileNotFoundException propagated from {@code cache.readAsString}
     * @throws IOException           propagated from {@code cache.readAsString}
     */
    public DocInstance(Cache cache, URLStruct us) throws FileNotFoundException, IOException {
        this.cache = cache;
        url = us;

        // Original author's note: the cache either serves a stored copy or
        // downloads the document, and hands it back as a StringBuffer.
        fstring = cache.readAsString(url);

        urilist = new Vector();
        nslist = new Vector();
        rdflist = new Vector();
    }

    /**
     * Analyze the HTML text and find out the outgoing URLs.
     * Runs an {@code HTMLInstance} over the document and appends the URIs,
     * namespaces and RDF results it reports to this object's lists, so
     * results of repeated or mixed analyses accumulate.
     */
    public void analyzeHTML() {
        HTMLInstance hinst = new HTMLInstance(url, fstring);
        hinst.analyze();
        urilist.addAll(hinst.getUri());
        nslist.addAll(hinst.getNs());
        rdflist.addAll(hinst.getRdf());
    }

    /**
     * Analyze the RDF text and find out uris, namespaces and rdflists.
     * Runs an {@code RDFInstance} over the document and appends the URIs,
     * namespaces and RDF results it reports to this object's lists.
     *
     * @throws Exception whatever {@code RDFInstance} raises while analyzing
     */
    public void analyzeRDF() throws Exception {
        RDFInstance rinst = new RDFInstance(url, fstring);
        rinst.analyze();
        urilist.addAll(rinst.getUri());
        nslist.addAll(rinst.getNs());
        rdflist.addAll(rinst.getRdf());
    }

    /**
     * @return the accumulated list of URIs; this is the internal list itself,
     *         not a copy, so changes made by the caller are visible here
     */
    public Vector getUri() {
        return urilist;
    }

    /**
     * @return the accumulated list of namespaces; this is the internal list
     *         itself, not a copy
     */
    public Vector getNs() {
        return nslist;
    }

    /**
     * @return the accumulated RDF results; this is the internal list itself,
     *         not a copy
     */
    public Vector getRdf() {
        return rdflist;
    }

    /**
     * This method is adapted from org.gjt.vinny.html.HTMLEncoder,
     * it is a utility method for converting
     * a string into a format suitable for placing inside HTML,
     * so that special symbols: &lt;, &gt;, &amp;, &quot; and ' are properly escaped.
     *
     * @param val text to escape; may be {@code null}
     * @return the escaped text, or an empty string when {@code val} is {@code null}
     */
    public static String encode(String val) {
        if (val == null) {
            return "";
        }

        // A little headroom so a few escapes do not force an immediate regrow.
        StringBuffer buf = new StringBuffer(val.length() + 8);
        for (int i = 0; i < val.length(); i++) {
            char c = val.charAt(i);
            switch (c) {
            case '<':
                buf.append("&lt;");
                break;
            case '>':
                buf.append("&gt;");
                break;
            case '&':
                buf.append("&amp;");
                break;
            case '\"':
                buf.append("&quot;");
                break;
            case '\'':
                // "&apos;" is an XML entity that HTML 4 does not define;
                // the numeric reference is valid in HTML and XML alike.
                buf.append("&#39;");
                break;
            default:
                buf.append(c);
                break;
            }
        }
        return buf.toString();
    }
}
