/*
 * This file is part of Mable+, a program for checking MAB data for errors.
 *
 * Copyright (C) 2008, 2011-2012 Kooperativer Bibliotheksverbund
 * Berlin-Brandenburg (KOBV) <http://www.kobv.de>,
 * im Konrad-Zuse-Zentrum für Informationstechnik
 * Berlin (ZIB) <http://www.zib.de>, Takustr. 7, D-14195 Berlin-Dahlem
 * Author(s) Jens Schwidder, <schwidder(at)zib.de>,
 *           Pascal-Nicolas Becker, <becker(at)zib.de>
 *
 * This program is free software: you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation, either version 3 of the License, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program. If not, see <http://www.gnu.org/licenses/>.
 */
package de.kobv.mable.mab.parser;

import de.ddb.charset.MabCharset;
import de.kobv.mable.common.MableConfigurationException;
import de.kobv.mable.mab.MabSatzkennung;
import java.io.*;
import java.util.HashSet;
import java.util.Set;
import org.apache.log4j.Logger;

/**
 * Streaming parser for MAB2 data that reports records and fields to the
 * content handler provided by {@link AbstractMabParser}.
 *
 * The parser reads the input byte by byte. Each byte is turned into a
 * {@code char} by a plain cast, i.e. no character set decoding takes place
 * in this class. Field content is only collected for the categories
 * registered through {@link #setContentForCategories(Set)}; for all other
 * fields the content is skipped and {@code null} is passed to the handler.
 *
 * @author Jens Schwidder <schwidder(at)zib.de>
 *
 * TODO class hierarchy clean up (MabDatasetFinder, ...)
 * TODO call error handler
 * TODO call
 */
public class FastMabParser extends AbstractMabParser implements MabParser {

    /**
     * Logger for this class.
     */
    private static final Logger LOG = Logger.getLogger(FastMabParser.class);

    /**
     * Byte value of a carriage return.
     */
    public static final int CARRIAGERETURN = 0x000D;

    /**
     * Byte value of a line feed.
     */
    public static final int LINEFEED = 0x000A;

    /**
     * Maximum size of dataset that can be handled.
     */
    public static final int MAX_DATASET_SIZE = 100000;

    /**
     * Size of buffer for reading.
     * TODO refactor
     */
    private static final int BUFFER_SIZE = 8 * 1024;

    /**
     * Number of characters that make up a category (field number plus
     * indicator).
     */
    private static final int CATEGORY_LENGTH = 4;

    /**
     * Number of leading characters of a category that form the field number.
     */
    private static final int FELDNUMMER_LENGTH = 3;

    /**
     * Input stream for MAB2 data.
     */
    private InputStream source;

    /**
     * Buffered input stream for MAB2 data.
     */
    private BufferedInputStream bufferedSource;

    /**
     * Counter for read datasets.
     */
    private int datasetCounter;

    /**
     * Last token that was read.
     */
    private int lastToken;

    /**
     * Categories that require their content for processing by handler.
     *
     * For most categories the actual content is ignored. However for some, like
     * MAB001 or MAB010 the content is required to perform checks.
     */
    private Set<?> contentForCategories = new HashSet<String>();

    /**
     * Initializes variables and prepares parsing.
     */
    private void init() {
        datasetCounter = 0;
    }

    /**
     * Parses the MAB2 data stream and reports its content to the content
     * handler.
     *
     * The handler receives {@code startParsing()}, then for every dataset
     * {@code startSatz(...)}, one {@code startFeld(...)} per field and
     * {@code endSatz()}, and finally {@code endParsing()}.
     *
     * @throws IOException if error reading data occurs
     * @throws MableConfigurationException declared by the parser interface;
     *         not thrown directly by the code in this method
     * @throws IllegalStateException if no input stream has been set
     */
    @Override
    public void parse() throws IOException, MableConfigurationException {
        // Fail early with a clear message instead of a NullPointerException
        // after the handler has already been told that parsing started.
        if (bufferedSource == null) {
            throw new IllegalStateException(
                    "No source set; call setSource(InputStream) before parse().");
        }

        getContentHandler().startParsing();

        init();

        String satzkennung;

        // NOTE(review): the mark is never reset anywhere in this class;
        // kept to preserve the existing stream behaviour.
        bufferedSource.mark(MAX_DATASET_SIZE);

        // Read Satzkennung; a null result means the stream ended before a
        // complete header could be read.
        while ((satzkennung = readTokens(MabSatzkennung.HEADER_LENGTH)) != null) {
            datasetCounter++;

            if (LOG.isDebugEnabled()) {
                LOG.debug("Satzkennung (" + datasetCounter + "): " + satzkennung);
            }

            // TODO test interface/performance without object creation
            MabSatzkennung msk = new MabSatzkennung();
            msk.setValue(satzkennung);

            getContentHandler().startSatz(msk);

            LOG.debug("Read fields");

            // Read fields
            readFields();
        }

        getContentHandler().endParsing();
    }

    /**
     * Reads all fields of the current dataset and reports them to the
     * content handler, followed by {@code endSatz()}.
     *
     * Reading stops when the dataset end character is seen or when no further
     * complete category can be read (end of stream).
     *
     * @throws IOException if an error occurs while reading
     *
     * TODO end at SATZENDEZEICHEN
     */
    private void readFields() throws IOException {
        String category;

        while ((category = readCategory()) != null) {
            String feldnummer = category.substring(0, FELDNUMMER_LENGTH);

            String content = null;

            // Only collect the content if a handler asked for this category.
            if (contentForCategories.contains(feldnummer)) {
                content = readFieldContent();
                if (LOG.isDebugEnabled()) {
                    LOG.debug("    Content (" + category + "): '" + content + "'");
                }
            }

            getContentHandler().startFeld(feldnummer,
                    category.charAt(FELDNUMMER_LENGTH), content);

            if (this.lastToken == MabCharset.SATZENDEZEICHEN) {
                break;
            }

            // Content was not consumed yet: skip to the end of this field.
            if (this.lastToken != MabCharset.FELDENDEZEICHEN) {
                jumpToNextField();
                if (this.lastToken == MabCharset.SATZENDEZEICHEN) {
                    break;
                }
            }
        }

        getContentHandler().endSatz();
    }

    /**
     * Skips tokens until a field end character, a dataset end character or
     * the end of the stream has been read.
     *
     * @throws IOException if an error occurs while reading
     */
    private void jumpToNextField() throws IOException {
        int token;
        while ((token = nextToken()) != -1) {
            if (token == MabCharset.FELDENDEZEICHEN || token == MabCharset.SATZENDEZEICHEN) {
                break;
            }
        }
    }

    /**
     * Reads the stream until the end of a field.
     *
     * The terminating field end or dataset end character is consumed but not
     * part of the returned content.
     *
     * @return String Content of a field
     * @throws IOException if an error occurs while reading
     */
    private String readFieldContent() throws IOException {
        StringBuilder content = new StringBuilder();

        int token;
        while ((token = nextToken()) != -1) {
            if (token == MabCharset.FELDENDEZEICHEN || token == MabCharset.SATZENDEZEICHEN) {
                break;
            }
            content.append(( char )token);
        }

        return content.toString();
    }

    /**
     * Reads a number of characters from the data stream.
     * @param length Number of characters
     * @return String containing characters read, or {@code null} if the end
     *         of the stream was reached before {@code length} characters
     *         could be read
     * @throws IOException if error occurs while reading data
     */
    private String readTokens(int length) throws IOException {
        StringBuilder content = new StringBuilder(Math.max(length, 0));
        for (int i = 0; i < length; i++) {
            int token = nextToken();

            if (token == -1) {
                // TODO throw exception
                return null;
            }
            content.append(( char )token);
        }
        return content.toString();
    }

    /**
     * Reads the category (field number and indicator) of the next field.
     *
     * @return String with exactly four characters, or {@code null} if the
     *         end of the stream or a dataset end character was read first
     * @throws IOException if error occurs while reading data
     */
    private String readCategory() throws IOException {
        StringBuilder content = new StringBuilder(CATEGORY_LENGTH);
        for (int i = 0; i < CATEGORY_LENGTH; i++) {
            int token = nextToken();

            if (token == -1 || token == MabCharset.SATZENDEZEICHEN) {
                // TODO throw exception
                return null;
            }
            content.append(( char )token);
        }
        return content.toString();
    }

    /**
     * Returns the next character in the data stream.
     *
     * Line feeds and carriage return characters are ignored by this function.
     * Some MAB files contain such characters to separate datasets in addition
     * to the dataset end character specified for MAB2 files.
     *
     * @return the next byte that is not a line feed or carriage return, or
     *         {@code -1} at the end of the stream
     * @throws IOException if error reading data occurs
     */
    private int nextToken() throws IOException {
        int newChar;

        // Skip line breaks iteratively; recursion here could exhaust the
        // stack on input containing a long run of line break characters.
        do {
            newChar = this.bufferedSource.read();
        } while (newChar == LINEFEED || newChar == CARRIAGERETURN);

        lastToken = newChar;

        return newChar;
    }

    /**
     * Setter for data stream for reading MAB2 data.
     *
     * The stream is internally wrapped using a BufferedInputStream if
     * necessary.
     *
     * @param source InputStream, may be {@code null} to clear the source
     */
    @Override
    public void setSource(InputStream source) {
        this.source = source;

        if (source == null) {
            this.bufferedSource = null;
        }
        else if (source instanceof BufferedInputStream) {
            this.bufferedSource = ( BufferedInputStream )source;
        }
        else {
            this.bufferedSource = new BufferedInputStream(source);
        }
    }

    /**
     * Returns number of datasets that were read while searching.
     * @return Integer Number of datasets read
     */
    @Override
    public int getDatasetsProcessed() {
        return datasetCounter;
    }

    /**
     * Returns the number of datasets that were ignored.
     * @return always 0 for this parser
     */
    @Override
    public int getDatasetsIgnored() {
        return 0;
    }

    /**
     * Not supported by this parser.
     *
     * @param source Reader
     * @throws UnsupportedOperationException always
     */
    @Override
    public void setSource(Reader source) {
        throw new UnsupportedOperationException("Not supported yet.");
    }

    /**
     * Sets the categories (three character field numbers) for which the
     * field content is read and passed to the content handler.
     *
     * @param categories Set of field numbers; {@code null} is treated as an
     *        empty set, i.e. no field content is collected
     */
    public void setContentForCategories(final Set categories) {
        if (categories == null) {
            // Avoid a NullPointerException later on while reading fields.
            this.contentForCategories = new HashSet<String>();
        }
        else {
            this.contentForCategories = categories;
        }
    }

}
