package edu.unika.aifb.rdf.rdfcrawler;

import java.io.*;
import java.util.*;
import java.util.regex.*;
import org.xml.sax.*;

import edu.unika.aifb.rdf.api.model.*;
import edu.unika.aifb.rdf.api.syntax.*;
import edu.unika.aifb.rdf.api.util.*;

/**
 * RDFInstance - processes the RDF metadata found in a single document.
 * An instance is initialized with the document's {@link URLStruct} and a
 * StringBuffer holding the entire document; this is the only constructor
 * this class provides.
 * (Probably we might want to implement initialization from Input streams as well).
 *
 * <p>Calling {@link #analyze()} parses the RDF and fills three accumulators,
 * which are then available through {@link #getRdf()}, {@link #getUri()} and
 * {@link #getNs()}. Results accumulate: every call to {@link #analyze()}
 * appends to the same Vectors, and no de-duplication is performed.
 */
public class RDFInstance {
    /** File extensions whose documents are handed to the RDF parser as a whole. */
    protected static final Set s_rdfExtensions=new HashSet();
    static {
        s_rdfExtensions.add("xml");
        s_rdfExtensions.add("rdf");
        s_rdfExtensions.add("rdfs");
        s_rdfExtensions.add("daml");
    }

    /**
     * Regular expression that cuts RDF fragments out of a non-RDF document:
     * an opening <code>RDF</code> tag (optionally namespace-prefixed), the
     * shortest possible content, and a closing <code>RDF</code> tag.
     * Compiled once, because the pattern never changes between calls.
     */
    private static final Pattern s_rdfFragmentPattern=
        Pattern.compile("<(\\w+:)?RDF[^<>]*?>.*?</(\\w+:)?RDF[^<>]*?>",Pattern.DOTALL);

    /**
     * Placeholder URI passed to RDFManager.createModel and used as the
     * system id of the parser input.
     * NOTE(review): the document's own URL is not used here, so relative
     * references inside the RDF presumably resolve against this placeholder
     * - confirm whether that is intended.
     */
    private static final String s_dummyBaseURI="file:///c:/dummy";

    /** Contents of the file as a StringBuffer */
    private final StringBuffer fstring;

    /** url of this resource */
    private final URLStruct url;

    /** List of URIs found in this document (statement subjects and resource objects) */
    private final Vector urilist;

    /** List of RDF namespaces found in this document (derived from statement predicates) */
    private final Vector nslist;

    /** RDF fact accumulator (the statements of every parsed model) */
    private final Vector rdflist;

    /**
     * Initialize from StringBuffer
     *
     * @param us   URL of the document; its extension decides how
     *             {@link #analyze()} treats the content
     * @param arg1 the entire document text
     */
    public RDFInstance(URLStruct us, StringBuffer arg1) {
        this.url = us;
        fstring = arg1;
        urilist = new Vector();
        nslist = new Vector();
        rdflist = new Vector();
    }

    /**
     * Extracts the RDF from the document and accumulates its statements,
     * URIs and namespaces.
     *
     * <p>If the URL's extension is one of {@link #s_rdfExtensions}, the whole
     * document is parsed as RDF. Otherwise every RDF fragment matched by the
     * fragment pattern is parsed separately, in document order.
     *
     * @throws Exception whatever {@link #processRDF(String)} throws; the
     *                   first failing fragment aborts the analysis, and
     *                   fragments processed before it stay accumulated
     */
    public void analyze() throws Exception {
        // NOTE(review): the lookup is an exact match against lower-case
        // entries; whether getExtension() normalizes case is not visible here.
        if (s_rdfExtensions.contains(url.getExtension())) {
            processRDF(fstring.toString());
        }
        else {
            Matcher matcher = s_rdfFragmentPattern.matcher(fstring);
            while (matcher.find()) {
                processRDF(matcher.group());
            }
        }
    }

    /**
     * Parses one piece of RDF/XML into a fresh model and appends its content
     * to the accumulators.
     *
     * @param rdf the RDF/XML text to parse (a whole document or one fragment)
     * @throws Exception if the parser or the model access fails; when parsing
     *                   fails nothing is added, because the accumulators are
     *                   only touched after a successful parse
     */
    protected void processRDF(String rdf) throws Exception {
        RDFParser rdfParser=RDFManager.createParser();
        // Create a source for XML parsing
        Model model=RDFManager.createModel(s_dummyBaseURI,null);
        InputSource inputSource=new InputSource(new StringReader(rdf));
        inputSource.setSystemId(s_dummyBaseURI);
        rdfParser.parse(inputSource,new ModelConsumer(model));

        // Single pass over the model: record each fact in rdflist and, for the
        // same statement, its subject URI, the namespace cut from its
        // predicate URI, and its object URI when the object is a resource.
        for (Iterator statements = model.iterator(); statements.hasNext();) {
            Statement s = (Statement)statements.next();
            rdflist.add(s);
            urilist.add(s.subject().getURI());
            nslist.add(URLList.cutRef(s.predicate().getURI()));
            if (s.object() instanceof Resource)
                urilist.add(((Resource)s.object()).getURI());
        }
    }

    /**
     * @return the internal Vector of URIs collected so far (not a copy);
     *         may contain duplicates
     */
    public Vector getUri() {
        return urilist;
    }

    /**
     * @return the internal Vector of namespaces collected so far (not a copy);
     *         may contain duplicates
     */
    public Vector getNs() {
        return nslist;
    }

    /**
     * @return the internal Vector of statements collected so far (not a copy)
     */
    public Vector getRdf() {
        return rdflist;
    }
}
