package de.fzi.wim.trie.extractor.syntax;

import java.util.Vector;
import de.fzi.wim.trie.extractor.*;

/**
 * Creates the syntax tokens for a string while normalizing its whitespace.
 *
 * All methods are static; tokens are appended to a caller-supplied
 * {@link Vector} as {@link GenericToken} instances.
 *
 * @author <a href="zach@fzi.de">Valentin Zacharias</a>
 */
public class SyntaxTokenFactory {


    /**
     * Removes superfluous whitespace from a string, but creates tokens for
     * whitespace and punctuation with a special meaning (such as new lines or
     * tabs). Equivalent to calling
     * {@link #createSyntaxTokens(String, Vector, boolean, boolean)} with both
     * <code>specials</code> and <code>ends</code> set to <code>true</code>.
     *
     * @param oldString the text to normalize and tokenize; must not be null.
     * @param tokens the vector the created tokens are appended to.
     * @return the text with every run of whitespace collapsed into one blank.
     */
    public static String createSyntaxTokens(String oldString, Vector tokens) {
        return createSyntaxTokens(oldString, tokens, true, true);
    }


    /**
     * Removes superfluous whitespace from a string, but creates tokens for
     * whitespace and punctuation with a special meaning (such as new lines or
     * tabs).
     *
     * Every run of whitespace that is followed by a non-whitespace character is
     * replaced by a single blank (a leading run therefore yields one leading
     * blank); a trailing run is dropped. All token indices refer to positions
     * in the returned string, not in <code>oldString</code>.
     *
     * @param oldString the text to normalize and tokenize; must not be null.
     * @param tokens the vector the created tokens are appended to.
     * @param specials if special characters should be recognized (this may decrease learning performance).
     * @param ends set this to true if you want begin and end to be created.
     * @return the text with its whitespace normalized as described above.
     */
    public static String createSyntaxTokens(String oldString, Vector tokens, boolean specials, boolean ends) {
        char[] oldC = oldString.toCharArray();
        // The result can never be longer than the input: each whitespace run
        // consumes at least one input character and emits at most one blank.
        char[] newC = new char[oldC.length];
        int newI = 0;
        boolean wasWhitespace = false;

        if (ends) tokens.addElement(new GenericToken(0, 0, SyntaxToken.begin));

        for (int oldI = 0; oldI < oldC.length; oldI++) {
            Character current = new Character(oldC[oldI]);
            if (Character.isWhitespace(oldC[oldI])) {
                wasWhitespace = true;
                // The token is placed at newI, the position the collapsed blank
                // will occupy once the next non-whitespace character arrives.
                // No "special" check is needed here: whitespace is never special.
                makeGenericTokens(current, newI, tokens);
            }
            else {
                if (wasWhitespace) {
                    // Emit the single blank that stands in for the whole run.
                    wasWhitespace = false;
                    newC[newI] = ' ';
                    newI++;
                }
                // Token order for one character: special, end-of-sentence, generic.
                if (isSpecialCharacter(oldString, oldI, specials)) tokens.addElement(new GenericToken(newI, newI, SyntaxToken.special));
                if (isEndOfSentence(oldString, oldI)) tokens.addElement(new GenericToken(newI, newI, SyntaxToken.eos));
                makeGenericTokens(current, newI, tokens);
                newC[newI] = oldC[oldI];
                newI++;
            }
        }

        if (ends) {
            // The end token sits on the last character of the result. For an
            // empty result it is clamped to 0 (matching the begin token) instead
            // of producing the invalid index -1.
            int endIndex = (newI > 0) ? newI - 1 : 0;
            tokens.addElement(new GenericToken(endIndex, endIndex, SyntaxToken.end));
        }
        return new String(newC, 0, newI);
    }

    /**
     * Appends a token for the given character if
     * <code>SyntaxToken.tokenForCharacter</code> has an entry for it; does
     * nothing otherwise.
     *
     * @param c the character to look up.
     * @param index the index passed to the token as both of its position arguments.
     * @param tokens the vector the token is appended to.
     */
    private static void makeGenericTokens(Character c, int index, Vector tokens) {
        SyntaxToken st = (SyntaxToken) SyntaxToken.tokenForCharacter.get(c);
        if (st != null) {
            tokens.addElement(new GenericToken(index, index, st));
        }
    }

    /**
     * Returns true if the char at the given index represents a "special"
     * character (like / or &amp;): one that is neither a letter, a digit nor
     * whitespace and has no entry in <code>SyntaxToken.tokenForCharacter</code>.
     *
     * @param string the text being examined.
     * @param i the index of the character to test.
     * @param specials if false, special characters are not recognized and the result is always false.
     */
    private static boolean isSpecialCharacter(String string, int i, boolean specials) {
        if (!specials) return false;
        char ch = string.charAt(i);
        if (Character.isLetterOrDigit(ch) || Character.isWhitespace(ch)) return false;
        return SyntaxToken.tokenForCharacter.get(new Character(ch)) == null;
    }

    /**
     * Returns true if the character at the given index represents the end of
     * a sentence.
     * Here a very defensive approach is taken, i.e. for a high precision
     * low recall/recognition is accepted: only a '.' that is not adjacent to a
     * digit and is followed - directly or after exactly one blank - by an
     * upper case letter is accepted. A '.' at the very start or very end of
     * the string is never an end of sentence.
     *
     * @param string the text being examined.
     * @param i the index of the character to test.
     */
    private static boolean isEndOfSentence(String string, int i) {
        if (string.charAt(i) != '.') return false;

        int length = string.length();
        // Both neighbours must exist; otherwise there is nothing to decide on.
        if (i < 1 || i + 1 >= length) return false;

        // A dot next to a digit is taken to be part of a number (e.g. "3.14", "1.").
        if (Character.isDigit(string.charAt(i - 1))) return false;
        if (Character.isDigit(string.charAt(i + 1))) return false;

        // Skip at most one plain blank between the dot and the next word.
        int next = i + 1;
        if (string.charAt(next) == ' ') next++;
        if (next >= length) return false;

        char following = string.charAt(next);
        return Character.isLetter(following) && Character.isUpperCase(following);
    }

}
