package edu.unika.aifb.rdf.rdfcrawler;

import java.net.*;
import java.util.*;
import java.util.regex.*;

/**
 * HTMLInstance - extracts the hyperlinks contained in one HTML document.
 *
 * An instance is created from the document's URL (wrapped in a URLStruct)
 * and a StringBuffer holding the document text. Calling {@link #analyze()}
 * scans the text for link-bearing tags, resolves every link against the
 * document URL and collects the absolute URLs in the list returned by
 * {@link #getUri()}.
 *
 * The namespace and RDF lists are created empty and are not filled by any
 * code in this class; they are only exposed through their getters.
 *
 * NOTE(review): the original header mentioned initialization by URI with a
 * cache lookup and a list of unresolved problems documented in main().
 * Neither a URI-only constructor nor a main() exists in this class.
 */
public class HTMLInstance {
    /**
     * Matches an opening A, AREA, LINK, FRAME or IFRAME tag that carries an
     * HREF or SRC attribute. Matching is case-insensitive, other attributes
     * may precede the link attribute, and quotes around the value are
     * optional. The value is captured by one of two groups:
     * group 4 - value (optionally preceded by a quote) that is directly
     *           followed by a quote or by the closing angle bracket;
     * group 5 - unquoted value that is followed by whitespace, i.e. by
     *           further attributes.
     * The tag/attribute combination is not validated, so e.g. an A tag with
     * a SRC attribute is accepted as well.
     */
    private static final Pattern LINK_PATTERN = Pattern.compile(
        "<(a|area|link|frame|iframe)\\s([^>]*\\s)?(href|src)\\s*=\\s*"
            + "(?:[\"']?([^\\s\"'<>]+)[\"'>]|([^\\s\"'<>]+)\\s)",
        Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** Capturing group holding a value terminated by a quote or by '>'. */
    private static final int DELIMITED_URL_GROUP = 4;

    /** Capturing group holding an unquoted value terminated by whitespace. */
    private static final int UNQUOTED_URL_GROUP = 5;

    /** List of URIs found in this document (absolute URLs as Strings) */
    private final Vector urilist;

    /** List of RDF namespaces found in this document */
    private final Vector nslist;

    /** RDF fact accumulator */
    private final Vector rdflist;

    /** Text of the document that is scanned by analyze() */
    private final StringBuffer fstring;

    /** The URL of the current document, as delivered by the URLStruct */
    private final String urlstring;

    /** The same URL as a java.net.URL; null if urlstring could not be parsed */
    private final URL base;

    /**
     * Initialize from StringBuffer.
     *
     * @param us   supplies the URL of the document through getURL()
     * @param arg1 the entire text of the document
     */
    public HTMLInstance(URLStruct us, StringBuffer arg1) {
        this.urlstring = us.getURL();
        this.fstring = arg1;
        this.urilist = new Vector();
        this.nslist = new Vector();
        this.rdflist = new Vector();

        URL parsed = null;
        try {
            // The URL is expected to be syntactically correct already
            // (the original author relied on URLStruct.check for that).
            parsed = new URL(urlstring);
        } catch (MalformedURLException e) {
            // Best effort: keep the instance usable. With a null base only
            // absolute links can be resolved in analyze().
            System.err.println("Unexpectted error in HTMLInstance.HTMLInstance(): " + e.toString());
        }
        this.base = parsed;
    }

    /**
     * Find all the links in the HTML text, transform them into absolute URLs
     * and append them to urilist, the list of URIs to be followed in the next
     * level of crawling.
     *
     * The following tag patterns are recognized:
     * {@code <A HREF="...">}, {@code <AREA HREF="...">},
     * {@code <LINK HREF="...">}, {@code <FRAME SRC="...">} and
     * {@code <IFRAME SRC="...">}.
     *
     * To accommodate lax HTML syntax the match is case-insensitive, quotes
     * around the value are not required and other attributes may be written
     * before or after the link attribute.
     *
     * Links that cannot be turned into a URL (unknown protocol, or a relative
     * link while the document URL itself could not be parsed) are skipped.
     * Every call appends to urilist again; the list is not cleared first.
     * Nothing is done if the instance was created without document text.
     */
    public void analyze() {
        if (fstring == null) {
            // No document text was supplied - there is nothing to scan.
            return;
        }

        Matcher matcher = LINK_PATTERN.matcher(fstring);
        while (matcher.find()) {
            // Exactly one of the two value groups takes part in a match.
            String res = matcher.group(DELIMITED_URL_GROUP);
            if (res == null) {
                res = matcher.group(UNQUOTED_URL_GROUP);
            }

            try {
                URL newurl = new URL(base, res);
                urilist.add(newurl.toString());
            } catch (MalformedURLException ignored) {
                // Deliberately skipped: such a link (e.g. "javascript:...")
                // cannot be crawled anyway.
            }
        }
    }

    /**
     * @return the internal list of absolute URLs (Strings) collected by
     *         analyze(); the list itself is returned, not a copy
     */
    public Vector getUri() {
        return urilist;
    }

    /**
     * @return the internal list of RDF namespaces; the list itself is
     *         returned, not a copy
     */
    public Vector getNs() {
        return nslist;
    }

    /**
     * @return the internal list of RDF facts; the list itself is returned,
     *         not a copy
     */
    public Vector getRdf() {
        return rdflist;
    }
}
