package edu.unika.aifb.rdf.rdfcrawler;

import java.net.*;
import java.io.*;

/**
 * NetRetrieve - fetch URLs and write them to files.
 *
 * <p>Downloaded content is stored in an "rdf" subdirectory of the directory
 * named by the <code>java.io.tmpdir</code> system property.
 */
public class NetRetrieve {
    /** Directory that receives the downloaded files. */
    protected static final File TEMP_RDF_DIRECTORY;
    static {
        File tempDirectory=new File(System.getProperty("java.io.tmpdir"));
        TEMP_RDF_DIRECTORY=new File(tempDirectory,"rdf");
        // Create the target directory once, when the class is loaded.
        if (!TEMP_RDF_DIRECTORY.exists())
            TEMP_RDF_DIRECTORY.mkdirs();
    }

    /**
     * Downloads the resource described by <code>us</code> into a file inside
     * {@link #TEMP_RDF_DIRECTORY}.
     *
     * <p>The request carries fixed "Accept" and "User-Agent" headers and, when
     * the parent URL of <code>us</code> is neither null nor empty, a "Referer"
     * header holding that parent URL. The file name is the hash code of the
     * URL followed by an extension derived from the response's content type.
     *
     * @param us the URL to fetch (<code>getURL()</code>) and the URL it was
     *           reached from (<code>getParentURL()</code>, may be null or empty)
     * @return the path of the file the content was written to
     * @throws MalformedURLException if <code>us.getURL()</code> is not a valid URL
     * @throws FileNotFoundException if the target file cannot be opened for writing
     * @throws IOException if connecting, reading or writing fails
     */
    public static String download(URLStruct us) throws MalformedURLException, IOException, FileNotFoundException {
        URL url = new URL( us.getURL() );
        URLConnection uc = url.openConnection();
        String referer = us.getParentURL();
        // Test the content, not the reference: "!=" on strings only compares
        // object identity, so an empty (non-interned) referer slipped through.
        if ( referer != null && referer.length() > 0 )
            uc.setRequestProperty("Referer", referer);
        uc.setRequestProperty("Accept", "text/html, text/xml, text/rdf, text/plain");
        uc.setRequestProperty("User-Agent", "RDF Crawler by kaa@aifb.uni-karlsruhe.de");
        InputStream f = uc.getInputStream();
        try {
            // The header may be absent (null); makeExtension copes with that.
            String extension = makeExtension(uc.getHeaderField("Content-type"));
            // NOTE(review): URL.hashCode() is not unique per URL, so two
            // different URLs can map to the same file name - confirm whether
            // overwriting is acceptable before relying on these files.
            File file=new File(TEMP_RDF_DIRECTORY,url.hashCode()+"."+extension);
            FileOutputStream fos = new FileOutputStream(file);
            try {
                byte[] buf = new byte[4096];
                int len;
                while ((len = f.read(buf)) != -1) {
                    fos.write( buf, 0, len );
                }
            }
            finally {
                fos.close();
            }
            return file.toString();
        }
        finally {
            f.close();
        }
    }

    /**
     * Maps a MIME content type to a file extension.
     *
     * @param contentType value of the content-type header, e.g. "text/html";
     *                    may be null when the header is missing
     * @return "html", "xml", "rdf", "oil" or "daml" when the part after the
     *         last "/" starts with that word (ignoring case), otherwise "txt"
     */
    private static String makeExtension(String contentType) {
        // No content type at all: fall back to the default extension
        // instead of failing with a NullPointerException.
        if (contentType == null) return "txt";

        // Cut off the front part of the contentType MIME,
        // e.g. discard "text/", if the full string is "text/html".
        int from = contentType.lastIndexOf("/");
        if (from == -1) return "txt";
        String result = contentType.substring(from+1);

        // check what remains (MIME type names are case-insensitive):
        if (startsWithIgnoreCase(result, "html")) return "html";
        else if (startsWithIgnoreCase(result, "xml")) return "xml";
        else if (startsWithIgnoreCase(result, "rdf")) return "rdf";
        else if (startsWithIgnoreCase(result, "oil")) return "oil";
        else if (startsWithIgnoreCase(result, "daml")) return "daml";
        else return "txt";
    }

    /** Tells whether <code>s</code> begins with <code>prefix</code>, ignoring case. */
    private static boolean startsWithIgnoreCase(String s, String prefix) {
        // regionMatches compares character by character and does not depend
        // on the default locale, unlike toLowerCase().
        return s.regionMatches(true, 0, prefix, 0, prefix.length());
    }
}
