/*
 * This file is part of Mable+, a program for checking MAB data for errors.
 *
 * Copyright (C) 2008, 2011-2012 Kooperativer Bibliotheksverbund
 * Berlin-Brandenburg (KOBV) <http://www.kobv.de>,
 * im Konrad-Zuse-Zentrum für Informationstechnik
 * Berlin (ZIB) <http://www.zib.de>, Takustr. 7, D-14195 Berlin-Dahlem
 * Author(s) Jens Schwidder, <schwidder(at)zib.de>,
 *           Pascal-Nicolas Becker, <becker(at)zib.de>
 *
 * This program is free software: you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation, either version 3 of the License, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program. If not, see <http://www.gnu.org/licenses/>.
 */
package de.kobv.mable.mab.parser;

import de.ddb.charset.MabCharset;
import de.kobv.mable.mab.MabSatzkennung;
import java.io.IOException;
import java.io.InputStream;
import java.io.Writer;
import org.apache.log4j.Logger;

/**
 * Builds a simple index for a stream of MAB2 data.
 *
 * <p>For every dataset found in the source stream one line of the form
 * {@code <id>, <start offset>} is written to the output writer. The id is the
 * content of field 001 if that field directly follows the dataset header,
 * otherwise the literal {@code NONE}. The start offset is the number of bytes
 * consumed from the stream up to and including the end-of-record character of
 * the preceding dataset (0 for the first dataset).
 *
 * <p>Line feeds and carriage returns in the stream are skipped while reading,
 * but they are counted in the offsets. If line breaks follow an end-of-record
 * character, the recorded start offset therefore points at those line breaks
 * and not at the first header byte of the next dataset.
 *
 * <p>Instances keep mutable parsing state and are not designed for concurrent
 * use.
 *
 * @author Jens Schwidder <schwidder(at)zib.de>
 */
public class MabDatasetIndexer {

    /**
     * Logger for this class.
     */
    private static final Logger LOG = Logger.getLogger(MabDatasetIndexer.class);

    /**
     * Byte value of a carriage return character.
     */
    public static final int CARRIAGERETURN = 0x000D;

    /**
     * Byte value of a line feed character.
     */
    public static final int LINEFEED = 0x000A;

    /**
     * Length of the category token (field number plus indicator) that is read
     * directly after the dataset header.
     */
    private static final int CATEGORY_LENGTH = 4;

    /**
     * Input stream for MAB2 data.
     */
    private InputStream source;

    /**
     * Writer for index.
     */
    private Writer output;

    /**
     * Counter for read datasets.
     */
    private int datasetCounter;

    /**
     * Current position in datafile (number of bytes consumed so far).
     */
    private long position;

    /**
     * Start position of last dataset.
     */
    private long datasetStart;


    /**
     * Initializes variables and prepares parsing.
     */
    public void init() {
        position = 0;
        datasetStart = 0;
        datasetCounter = 0;
    }

    /**
     * Reads the complete source stream and writes one index line per dataset
     * to the output writer.
     *
     * <p>All counters are reset via {@link #init()} before reading starts.
     * Reading stops as soon as the stream ends; a dataset that is cut off
     * before its end-of-record character is still written to the index, and a
     * warning is logged.
     *
     * @throws IOException if an error occurs while reading the source or
     *         writing the index
     * @throws IllegalStateException if no source stream or no output writer
     *         has been set
     */
    public void index() throws IOException {
        if (this.source == null) {
            throw new IllegalStateException("No source stream set.");
        }

        Writer out = getOutput();
        if (out == null) {
            throw new IllegalStateException("No output writer set.");
        }

        String newLine = System.getProperty("line.separator");

        init();

        // Skip the dataset header (Satzkennung); false means end of stream.
        while (jumpTokens(MabSatzkennung.HEADER_LENGTH)) {
            datasetCounter++;

            String currentId = "NONE";

            // null if the stream ended inside the category token
            String category = readTokens(CATEGORY_LENGTH);

            if ("001 ".equals(category)) {
                currentId = readFieldContent();

                if (LOG.isDebugEnabled()) {
                    LOG.debug("Found field 001 for " + currentId + " ("
                            + datasetCounter + ")");
                }
            }

            // Plain concatenation keeps the offset locale-independent
            // (String.format("%d") may emit localized digits).
            out.write(currentId + ", " + datasetStart + newLine);

            jumpToNextDataset();
        }
    }

    /**
     * Reads data until the current dataset ends or the stream is exhausted,
     * then records the current position as start of the next dataset.
     * @throws IOException if error occurs while reading data
     */
    private void jumpToNextDataset() throws IOException {
        int token;
        do {
            token = nextToken();
        } while (token != -1 && token != MabCharset.SATZENDEZEICHEN);

        if (token == -1) {
            // Without this check a truncated file would loop forever,
            // because read() keeps returning -1 at end of stream.
            LOG.warn("Unexpected end of data in dataset " + datasetCounter
                    + ": end-of-record character is missing.");
        }

        datasetStart = position;
    }

    /**
     * Reads the stream until the end of a field or the end of the stream.
     *
     * Each byte is converted to a character by a plain cast.
     *
     * @return String Content of a field (without the end-of-field character)
     * @throws IOException if an error occurs while reading
     */
    private String readFieldContent() throws IOException {
        StringBuilder content = new StringBuilder();

        int token = nextToken();
        while (token != -1 && token != MabCharset.FELDENDEZEICHEN) {
            content.append((char) token);
            token = nextToken();
        }

        return content.toString();
    }

    /**
     * Reads a number of characters from the data stream.
     * @param length Number of characters
     * @return String containing characters read, or {@code null} if the
     *         stream ended before {@code length} characters were read
     * @throws IOException if error occurs while reading data
     */
    private String readTokens(final int length) throws IOException {
        StringBuilder content = new StringBuilder(length);
        for (int i = 0; i < length; i++) {
            int token = nextToken();

            if (token == -1) {
                return null;
            }
            content.append((char) token);
        }
        return content.toString();
    }

    /**
     * Returns the next character in the data stream.
     *
     * Line feeds and carriage return characters are ignored by this function.
     * Some MAB files contain such characters to separate datasets in addition
     * to the dataset end character specified for MAB2 files.
     *
     * The position counter is advanced for every byte consumed, including
     * skipped line breaks, but not when the end of the stream is reached.
     *
     * @return the next byte that is not a line break, or -1 at end of stream
     * @throws IOException if error reading data occurs
     */
    private int nextToken() throws IOException {
        int newChar;

        // Loop instead of recursion: long runs of line breaks must not be
        // able to exhaust the call stack.
        do {
            newChar = this.source.read();
            if (newChar != -1) {
                position++;
            }
        } while (newChar == LINEFEED || newChar == CARRIAGERETURN);

        return newChar;
    }

    /**
     * Jumps over a number of tokens (characters) in data stream.
     * @param length Number of characters to jump over
     * @return {@code true} if all characters could be skipped, {@code false}
     *         if the end of the stream was reached first
     * @throws IOException if error occurs while reading data
     */
    private boolean jumpTokens(final int length) throws IOException {
        for (int i = 0; i < length; i++) {
            if (nextToken() == -1) {
                return false;
            }
        }

        return true;
    }

    /**
     * Getter for data stream.
     * @return InputStream
     */
    public InputStream getSource() {
        return source;
    }

    /**
     * Setter for data stream for reading MAB2 data.
     *
     * The stream is used exactly as given; no buffering is added. Because the
     * data is read one byte at a time, callers should pass in a buffered
     * stream if read performance matters.
     *
     * @param source InputStream
     */
    public void setSource(final InputStream source) {
        this.source = source;
    }

    /**
     * Returns number of datasets that were read while indexing.
     * @return Integer Number of datasets read
     */
    public int getDatasetCounter() {
        return datasetCounter;
    }

    /**
     * Getter for the writer that receives the index lines.
     * @return Writer
     */
    public Writer getOutput() {
        return output;
    }

    /**
     * Setter for the writer that receives the index lines.
     *
     * The writer is neither flushed nor closed by this class.
     *
     * @param output Writer
     */
    public void setOutput(Writer output) {
        this.output = output;
    }

}
