#!/bin/bash

# Exit on error.
set -e

# Sort order is ASCII.
# LC_ALL is the variable the locale-aware tools (sort in particular) honour,
# and it overrides LANG and every individual LC_* category.
export LC_ALL="C"

# Print a progress message on stdout, prefixed with the current time.
# The time format comes from the global DATE (a date(1) "+FORMAT" string);
# it is expanded unquoted, so an empty DATE passes no format to date at all.
log() {
    local stamp
    # "|| true": a failing date must not abort the script, only blank the stamp.
    stamp="$(date $DATE)" || true
    echo " ${stamp}" "$@"
}

# Suffix for the intermediate data files: the PID of this shell.
TMP=$$
# Timestamp format handed to date(1) by log.
#DATE="+%Y-%m-%dT%H:%M:%S%:z"
DATE="+%H:%M:%S"
# Java classpath, produced by the external make_classpath helper.
# ${TDBROOT:+...} passes the root as one word even if it contains spaces,
# and passes no argument at all when TDBROOT is unset or empty.
CP="$(make_classpath ${TDBROOT:+"$TDBROOT"})"
# ${0##*/} is the script's file name; unlike an unquoted "basename $0" it
# survives whitespace in the path the script was invoked by.
USAGE="Usage: ${0##*/} --loc location datafile ..."
# Java package holding the bulk loader command classes.
PKG=com.hp.hpl.jena.tdb.store.bulkloader2

# At least a location and one more word are required.
if [ "$#" -lt 2 ]; then
    echo "$USAGE" 1>&2
    exit 1
fi

## Process --loc.
# Accepted spellings: "--loc DIR", "-loc DIR", "--loc=DIR" and "-loc=DIR".
# Anything else as the first argument is a usage error.
ARG1="$1"
shift
case "$ARG1" in
    -loc | --loc)
        LOC="$1"
        shift
        ;;
    -loc=* | --loc=*)
        # Strip only up to the FIRST '=', so a location that itself
        # contains "loc=" or '=' is kept intact.
        LOC="${ARG1#*=}"
        ;;
    *)
        echo "$USAGE" 1>&2
        exit 1
        ;;
esac

# Create the location if it does not exist yet; it must be a directory.
if [ ! -e "$LOC" ]; then
    mkdir "$LOC"
fi
if [ ! -d "$LOC" ]; then
    echo "Not a directory: $LOC" 1>&2
    exit 1
fi

# Remaining arguments (the data files), joined into one space-separated string.
FILES="$*"
## Stdin?
# Default to empty; the value "yes" makes the script keep its work files.
KEEPWORKFILES="${KEEPWORKFILES:-}"
# ---- Start
log "-- TDB Bulk Loader Start"
TIME1="$(date +%s)"

# ---- Data loading phase
log "Data phase"
# Produce nodes file and triples/quads text file.

DATA_TRIPLES="$LOC/data-triples.$TMP"
DATA_QUADS="$LOC/data-quads.$TMP"

# The data files are passed as "$@": once the location option has been
# shifted off, the positional parameters are exactly the data files, and
# quoting them this way keeps names with spaces or glob characters intact.
java -Xmx1200M -cp "$CP" -server "$PKG".CmdNodeTableBuilder \
    "--loc=$LOC" "--triples=$DATA_TRIPLES" "--quads=$DATA_QUADS" "$@"

# ---- Index intermediates
## All files are written as S P O / G S P O columns per row, but in different sort orders.
log "Index phase"

# Sort one row file into index order and build the index from it.
#
# Arguments:
#   $1 - sort(1) key options as a single string, e.g. "-k 2,2 -k 1,1"
#   $2 - path of the row data file to sort
#   $3 - index name; names the "$LOC/<name>-txt" work file and the
#        "$LOC/<name>.dat" / "$LOC/<name>.idn" files removed before the build
# Globals read: LOC, CP, PKG, KEEPWORKFILES
#
# Returns immediately (status 0) when the data file is missing or empty.
# The sorted work file is deleted afterwards unless KEEPWORKFILES is "yes".
process_rows() {
    local KEYS="$1"
    local DATA="$2"
    local IDX="$3"
    local WORK="$LOC/$IDX-txt"

    if [ ! -s "$DATA" ]; then
        return 0
    fi

    log "Index $IDX"
    # LC_ALL=C pins byte-wise (ASCII) ordering whatever the caller's locale.
    # $KEYS is left unquoted on purpose: it carries several sort options
    # that must be split into separate words.
    # shellcheck disable=SC2086
    LC_ALL=C sort -u $KEYS < "$DATA" > "$WORK"

    log "Build $IDX"
    # Drop any previous copy of this index before rebuilding it.
    rm -f "$LOC/$IDX.dat"
    rm -f "$LOC/$IDX.idn"
    java -cp "$CP" -server "$PKG".CmdIndexBuild "$LOC" "$IDX" "$WORK"

    # Remove intermediary file.
    if [ "$KEEPWORKFILES" != "yes" ]; then
        rm "$WORK"
    fi
}

# sort(1) key selectors, one per column of a row.
K1="-k 1,1"
K2="-k 2,2"
K3="-k 3,3"
K4="-k 4,4"

# Each entry is "INDEX:sort keys"; the key order mirrors the index name.
# Triple indexes, built from the triples data file.
for INDEX_SPEC in \
    "SPO:$K1 $K2 $K3" \
    "POS:$K2 $K3 $K1" \
    "OSP:$K3 $K1 $K2"
do
    process_rows "${INDEX_SPEC#*:}" "$DATA_TRIPLES" "${INDEX_SPEC%%:*}"
done

# Quad indexes, built from the quads data file.
for INDEX_SPEC in \
    "GSPO:$K1 $K2 $K3 $K4" \
    "GPOS:$K1 $K3 $K4 $K2" \
    "GOSP:$K1 $K4 $K2 $K3" \
    "SPOG:$K2 $K3 $K4 $K1" \
    "POSG:$K3 $K4 $K2 $K1" \
    "OSPG:$K4 $K2 $K3 $K1"
do
    process_rows "${INDEX_SPEC#*:}" "$DATA_QUADS" "${INDEX_SPEC%%:*}"
done

log "Index phase end"
TIME2="$(date +%s)"

# ---- Clean up.

# The intermediate data files go away unless the caller asked to keep them.
[ "$KEEPWORKFILES" = "yes" ] || rm -f "$DATA_TRIPLES" "$DATA_QUADS"

# ---- End
log "-- TDB Bulk Loader Finish"
# Elapsed time runs from the start log line to the end of the index phase.
ELAPSED=$((TIME2 - TIME1))
log "-- $ELAPSED seconds"
