/*
 * This file is part of Mable+, a program for checking MAB data for errors.
 *
 * Copyright (C) 2008, 2011-2012 Kooperativer Bibliotheksverbund
 * Berlin-Brandenburg (KOBV) <http://www.kobv.de>,
 * im Konrad-Zuse-Zentrum für Informationstechnik
 * Berlin (ZIB) <http://www.zib.de>, Takustr. 7, D-14195 Berlin-Dahlem
 * Author(s) Jens Schwidder, <schwidder(at)zib.de>,
 *           Pascal-Nicolas Becker, <becker(at)zib.de>
 *
 * This program is free software: you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation, either version 3 of the License, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program. If not, see <http://www.gnu.org/licenses/>.
 */
package de.kobv.mable.mab.parser;

import de.ddb.charset.MabCharset;
import de.kobv.mable.mab.MabSatzkennung;
import java.io.*;
import java.util.Set;
import org.apache.log4j.Logger;

/**
 * Finds a dataset in a MAB2 file.
 *
 * For performance the characters are only decoded when necessary.
 *
 * @author Jens Schwidder <schwidder(at)zib.de>
 */
public class MabDatasetFinder {

    /**
     * Logger for this class.
     */
    private static final Logger LOG = Logger.getLogger(MabDatasetFinder.class);

    /**
     * Byte value of a carriage return character.
     */
    public static final int CARRIAGERETURN = 0x000D;

    /**
     * Byte value of a line feed character.
     */
    public static final int LINEFEED = 0x000A;

    /**
     * Maximum size of dataset that can be handled.
     *
     * Used as read limit when marking the start of a dataset, so that the
     * stream can be reset to that position once the dataset is identified.
     */
    public static final int MAX_DATASET_SIZE = 100000;

    /**
     * Size of buffer for reading.
     * TODO refactor
     */
    private static final int BUFFER_SIZE = 8 * 1024;

    /**
     * Bytes of the category that carries the dataset identifier: field
     * number "001" followed by a blank indicator.
     */
    private static final byte[] CATEGORY_001 = {'0', '0', '1', ' '};

    /**
     * Input stream for MAB2 data.
     */
    private InputStream source;

    /**
     * Buffered input stream for MAB2 data.
     */
    private BufferedInputStream bufferedSource;

    /**
     * Counter for read datasets.
     */
    private int datasetCounter;

    /**
     * Temporary buffer for dataset.
     */
    private byte[] buf;


    /**
     * Initializes variables and prepares parsing.
     */
    public void init() {
        datasetCounter = 0;
        buf = new byte[BUFFER_SIZE];
    }

    /**
     * Finds a dataset in a MAB2 data stream and returns it.
     *
     * The search starts at the current position of the stream and stops at
     * the first dataset whose first field is a 001 field with the requested
     * content.
     *
     * @param satzId MAB 001 identifier for dataset, must not be null
     * @return Bytes of the dataset (line breaks removed, including the
     *         dataset end character if present), or null if the end of the
     *         stream was reached without finding the dataset
     * @throws IOException if error reading data occurs
     * @throws IllegalStateException if no source has been set
     */
    public byte[] find(String satzId) throws IOException {
        requireSource();
        init();

        bufferedSource.mark(MAX_DATASET_SIZE);

        // Skip the Satzkennung (header) of each dataset
        while (jumpTokens(MabSatzkennung.HEADER_LENGTH)) {
            datasetCounter++;

            if (!readCategory()) {
                // stream ended within the first category
                break;
            }

            if (isIdentifierCategory()) {
                String currentId = readFieldContent();

                if (LOG.isDebugEnabled()) {
                    LOG.debug("Found field 001 for " + currentId + " (" + datasetCounter + ")");
                }

                if (satzId.equals(currentId)) {
                    LOG.debug("Satz '" + currentId + "' gefunden.");

                    return readDatasetBytes();
                }
            }
            // TODO handle if 001 is not first field after Satzkennung

            if (!jumpToNextDataset()) {
                // stream ended before the dataset end character
                break;
            }
        }

        return null;
    }

    /**
     * Finds multiple datasets.
     *
     * Every identifier that was found is removed from the set, so after the
     * call the set contains the identifiers that were not found. The search
     * stops as soon as the set is empty or the stream ends.
     *
     * @param identifiers Set<String> of dataset identifiers (MAB001)
     * @param output OutputStream for writing datasets
     * @throws IOException if i/o error occurs
     * @throws IllegalStateException if no source has been set
     */
    public void find(final Set<String> identifiers, final OutputStream output)
            throws IOException {
        requireSource();
        init();

        LOG.debug("Searching for: " + identifiers);

        bufferedSource.mark(MAX_DATASET_SIZE);

        // Check for remaining identifiers first, so no further header is
        // consumed once everything has been found.
        while (!identifiers.isEmpty()
                && jumpTokens(MabSatzkennung.HEADER_LENGTH)) {
            datasetCounter++;

            if (!readCategory()) {
                // stream ended within the first category
                break;
            }

            if (isIdentifierCategory()) {
                String currentId = readFieldContent();

                if (LOG.isDebugEnabled()) {
                    LOG.debug("Found field 001 for " + currentId + " (" + datasetCounter + ")");
                }

                if (identifiers.remove(currentId)) {
                    LOG.debug("Satz '" + currentId + "' gefunden.");

                    output.write(readDatasetBytes());

                    // readDatasetBytes consumed the dataset including its
                    // end character; the stream is already at the start of
                    // the next dataset, so only the mark has to be moved.
                    // Jumping again here would skip the following dataset.
                    bufferedSource.mark(MAX_DATASET_SIZE);
                    continue;
                }
            }
            // TODO handle if 001 is not first field after Satzkennung

            if (!jumpToNextDataset()) {
                // stream ended before the dataset end character
                break;
            }
        }
    }

    /**
     * Verifies that a source stream has been set.
     * @throws IllegalStateException if no source has been set
     */
    private void requireSource() {
        if (bufferedSource == null) {
            throw new IllegalStateException(
                    "No source set, call setSource before find.");
        }
    }

    /**
     * Reads the category (field number and indicator) into the start of the
     * temporary buffer.
     *
     * Line breaks are ignored like everywhere else outside of field content.
     *
     * @return true if the category was read completely, false if the stream
     *         ended first
     * @throws IOException if error occurs while reading data
     */
    private boolean readCategory() throws IOException {
        for (int i = 0; i < CATEGORY_001.length; i++) {
            int token = nextToken();

            if (token == -1) {
                return false;
            }

            buf[i] = ( byte )token;
        }

        return true;
    }

    /**
     * Checks if the category in the temporary buffer is the 001 field.
     *
     * The bytes are compared directly, so no decoding is necessary.
     *
     * @return true if the buffer starts with the bytes of "001 "
     */
    private boolean isIdentifierCategory() {
        for (int i = 0; i < CATEGORY_001.length; i++) {
            if (buf[i] != CATEGORY_001[i]) {
                return false;
            }
        }

        return true;
    }

    /**
     * Reads the entire current dataset.
     *
     * The start of the dataset has to be marked using the mark method of the
     * BufferedInputStream class. The data is read until a dataset end character
     * occurs or the stream ends. Line breaks are not part of the result.
     *
     * @return Bytes for current dataset
     * @throws IOException if error occurs while reading data, or if the
     *         stream cannot be reset to the start of the dataset
     */
    private byte[] readDatasetBytes() throws IOException {
        bufferedSource.reset();
        ByteArrayOutputStream content = new ByteArrayOutputStream();

        int token;
        while ((token = nextToken()) != -1) {
            content.write(token);

            if (token == MabCharset.SATZENDEZEICHEN) {
                break;
            }
        }

        return content.toByteArray();
    }

    /**
     * Reads data until the current dataset ends and marks the position
     * after the dataset end character as start of the next dataset.
     *
     * @return true if a dataset end character was found, false if the
     *         stream ended first
     * @throws IOException if error occurs while reading data
     */
    private boolean jumpToNextDataset() throws IOException {
        // read until end of dataset
        // TODO use buffer to be faster
        int token;
        do {
            token = nextToken();

            if (token == -1) {
                // Without this check the loop would never terminate for a
                // stream that ends without a dataset end character.
                return false;
            }
        } while (token != MabCharset.SATZENDEZEICHEN);

        bufferedSource.mark(MAX_DATASET_SIZE);
        return true;
    }

    /**
     * Reads the stream until the end of a field.
     *
     * Every byte is converted to the character with the same code. The
     * field end character is consumed but not part of the result.
     *
     * @return String Content of a field
     * @throws IOException if an error occurs while reading
     */
    private String readFieldContent() throws IOException {
        StringBuilder content = new StringBuilder();

        int token;
        while ((token = bufferedSource.read()) != -1) {
            if (token == MabCharset.FELDENDEZEICHEN) {
                break;
            }

            content.append(( char )token);
        }

        return content.toString();
    }

    /**
     * Reads a number of characters from the data stream.
     * @param length Number of characters
     * @return String containing characters read, or null if the stream ends
     *         before the requested number of characters was read
     * @throws IOException if error occurs while reading data
     */
    private String readTokens(int length) throws IOException {
        StringBuilder content = new StringBuilder();
        for (int i = 0; i < length; i++) {
            int token = nextToken();

            if (token == -1) {
                // TODO throw exception
                return null;
            }

            content.append(( char )token);
        }
        return content.toString();
    }

    /**
     * Returns the next character in the data stream.
     *
     * Line feeds and carriage return characters are ignored by this function.
     * Some MAB files contain such characters to separate datasets in addition
     * to the dataset end character specified for MAB2 files.
     *
     * @return Next byte that is not a line break, or -1 at end of stream
     * @throws IOException if error reading data occurs
     */
    private int nextToken() throws IOException {
        int newChar;

        // ignore line breaks; a loop avoids deep recursion for long runs
        do {
            newChar = this.bufferedSource.read();
        } while (newChar == LINEFEED || newChar == CARRIAGERETURN);

        return newChar;
    }

    /**
     * Jumps over a number of tokens (characters) in data stream.
     * @param length Number of characters to jump over
     * @return true if all characters were skipped, false if the stream
     *         ended first
     * @throws IOException if error occurs while reading data
     */
    private boolean jumpTokens(int length) throws IOException {
        for (int i = 0; i < length; i++) {
            if (nextToken() == -1) {
                return false;
            }
        }

        return true;
    }

    /**
     * Getter for data stream.
     * @return InputStream
     */
    public InputStream getSource() {
        return source;
    }

    /**
     * Setter for data stream for reading MAB2 data.
     *
     * The stream is internally wrapped using a BufferedInputStream if
     * necessary.
     *
     * @param source InputStream
     */
    public void setSource(InputStream source) {
        this.source = source;

        if (source == null) {
            this.bufferedSource = null;
        }
        else if (source instanceof BufferedInputStream) {
            this.bufferedSource = ( BufferedInputStream )source;
        }
        else {
            this.bufferedSource = new BufferedInputStream(source);
        }
    }

    /**
     * Returns number of datasets that were read while searching.
     * @return Integer Number of datasets read
     */
    public int getDatasetCounter() {
        return datasetCounter;
    }

}
