/*
 * This file is part of Mable+, a program for checking MAB data for errors.
 *
 * Copyright (C) 2008, 2011-2012 Kooperativer Bibliotheksverbund
 * Berlin-Brandenburg (KOBV) <http://www.kobv.de>,
 * im Konrad-Zuse-Zentrum für Informationstechnik
 * Berlin (ZIB) <http://www.zib.de>, Takustr. 7, D-14195 Berlin-Dahlem
 * Author(s) Jens Schwidder, <schwidder(at)zib.de>,
 *           Pascal-Nicolas Becker, <becker(at)zib.de>
 *
 * This program is free software: you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation, either version 3 of the License, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program. If not, see <http://www.gnu.org/licenses/>.
 */
package de.kobv.mable.mab.validation;

import java.util.Arrays;
import org.apache.log4j.Logger;

/**
 * Provides a static check that decides whether an integer value denotes a
 * character that is permitted by the MAB character set.
 * <p>The MAB tape format uses its own character set (see
 * <a href="http://www.d-nb.de/standardisierung/pdf/mab_unic.pdf">http://www.d-nb.de/standardisierung/pdf/mab_unic.pdf</a>).
 * MabCharsetChecker holds every permitted character as an integer value. The
 * static method {@link #checkUTF8bytes(int)} tests whether a given integer
 * value corresponds to one of these characters.</p>
 * <p>NOTE(review): although the method name mentions UTF-8, the table entries
 * are compared as plain integer values and are annotated with Unicode
 * character names (e.g. 0x201E, Double Low-9 Quotation Mark), so callers
 * presumably pass the character's code point rather than raw UTF-8 bytes -
 * confirm against the callers.</p>
 * <p>The list of characters originates from de.ddb.charset.MabCharset,
 * developed by Jürgen Kett, Deutsche Nationalbibliothek, and released under
 * the GPL.</p>
 * @author Pascal-Nicolas Becker &lt;becker(at)zib.de&gt;
 *
 *
 * TODO Can this be done using MabCharset instead of copying list here?
 */
public final class MabCharsetChecker {

    private static final Logger LOG = Logger.getLogger(MabCharsetChecker.class);

    /**
     * Lower bound (inclusive) of the contiguous range of values that
     * {@link #checkUTF8bytes(int)} accepts without consulting
     * {@link #VALID_CHARS}.
     */
    public static final int VALID_CHARS_START = 0x001D;

    /**
     * Upper bound (inclusive) of the contiguous range of values that
     * {@link #checkUTF8bytes(int)} accepts without consulting
     * {@link #VALID_CHARS}.
     */
    public static final int VALID_CHARS_END = 0x007E;

    /**
     * Individually permitted values outside the contiguous range.
     * <p>The array is sorted in ascending order when the class is initialised
     * because {@link #checkUTF8bytes(int)} looks values up with a binary
     * search. Callers must not modify the array; an unsorted array would make
     * the lookup unreliable.</p>
     */
    public static final int[] VALID_CHARS = {
            0x0098,
            0x009C,
            0x00A1,
            0x00A3,
            0x00A5,
            0x00A7,
            0x00AB,
            0x2021,
            0x00B7,
            0x00BB,
            0x00BF,
            0x034F, // Both possible??? (open question carried over from the original)
            // following once imported from DDBs MabCharset
            0x201E, // Double Low-9 Quotation Mark
            0x0024, // Dollar Sign
            0x2020, // Dagger
            0x2032, // Prime
            0x2018, // Left Single Quotation Mark
            0x201C, // Left Double Quotation Mark
            0x266D, // Music Flat Sign
            0x00A9, // Copyright Sign
            0x2117, // Sound Recording Copyright
            0x00AE, // Registered Sign
            0x02BB, // Modifier Letter Turned Comma
            0x02BC, // Modifier Letter Apostrophe
            0x201A, // Single Low-9 Quotation Mark
            0x2033, // Double Prime
            0x2019, // Right Single Quotation Mark
            0x201D, // Right Double Quotation Mark
            0x266F, // Music Sharp Sign (original note: still needs to be clarified)
            0x02B9, // Modifier Letter Prime
            0x02BA, // Modifier Letter Double Prime
            0x0309, // Combining Hook above
            0x0300, // Combining Grave Accent
            0x0301, // Combining Acute Accent
            0x0302, // Combining Circumflex Accent
            0x0303, // Combining Tilde
            0x0304, // Combining Macron
            0x0306, // Combining Breve
            0x0307, // Combining Dot Above
            0x0308, // Combining Diaeresis
            0x030A, // Combining Ring Above
            0x0315, // Combining Comma Above Right
            0x0312, // Combining Turned Comma Above
            0x030B, // Combining Double Acute Accent
            0x031B, // Combining Horn
            0x030C, // Combining Caron
            0x0327, // Combining Cedilla
            0x031C, // Combining Left Half Ring Below
            0x0326, // Combining Comma Below
            0x0328, // Combining Ogonek
            0x0325, // Combining Ring Below
            0x032E, // Combining Breve Below
            0x0323, // Combining Dot Below
            0x0324, // Combining Diaeresis Below
            0x0332, // Combining Low Line
            0x0333, // Combining Double Low Line
            0x0329, // Combining Vertical Line Below
            0x032D, // Combining Circumflex Accent Below
            0xFE20, // Combining Ligature Left Half
            0xFE21, // Combining Ligature Right Half
            0xFE23, // Combining Double Tilde Right Half
            0x00C6, // Latin Capital Letter AE
            0x0110, // Latin Capital Letter D with Stroke
            0x0132, // Latin Capital Ligature IJ
            0x0141, // Latin Capital Letter L with Stroke
            0x00D8, // Latin Capital Letter O with Stroke
            0x0152, // Latin Capital Ligature OE
            0x00DE, // Latin Capital Letter Thorn
            0x00E6, // Latin Small Letter AE
            0x0111, // Latin Small Letter D with Stroke
            0x00F0, // Latin Small Letter ETH
            0x0131, // Latin Small Letter Dotless I
            0x0133, // Latin Small Ligature IJ
            0x0142, // Latin Small Letter L with Stroke
            0x00F8, // Latin Small Letter O with Stroke
            0x0153, // Latin Small Ligature OE
            0x00DF, // Latin Small Letter Sharp S
            0x00FE, // Latin Small Letter Thorn
            // Precomposed letters with hook above (capitals, then small letters)
            0x1ea2,
            0x1eba,
            0x1ec8,
            0x1ece,
            0x1ee6,
            0x1ef6,
            0x1ea3,
            0x1ebb,
            0x1ec9,
            0x1ecf,
            0x1ee7,
            0x1ef7,
            // Precomposed letters with grave accent (capitals, then small letters)
            0x00c0,
            0x00c8,
            0x00cc,
            0x01f8,
            0x00d2,
            0x00d9,
            0x1e80,
            0x1ef2,
            0x00e0,
            0x00e8,
            0x00ec,
            0x01f9,
            0x00f2,
            0x00f9,
            0x1e81,
            0x1ef3,
            // Precomposed letters with acute accent (capitals, then small letters)
            0x00c1,
            0x0106,
            0x00c9,
            0x01f4,
            0x00cd,
            0x1e30,
            0x0139,
            0x1E3E,
            0x0143,
            0x00d3,
            0x1E54,
            0x0154,
            0x015a,
            0x00da,
            0x1e82,
            0x00dd, // Y with acute; capital partner of 0x00fd, was missing
            0x00d0, // NOTE(review): looks like a slip for 0x00dd; kept so that
                    // previously accepted input stays accepted - confirm and drop
            0x0179,
            0x00e1,
            0x0107,
            0x00e9,
            0x01F5,
            0x00ed,
            0x1E31,
            0x013a,
            0x1e3f,
            0x0144,
            0x00f3,
            0x1E55,
            0x0155,
            0x015b,
            0x00fa,
            0x1e83,
            0x00fd,
            0x017a,
            // Precomposed letters with dot above (capitals, then small letters)
            0x0226,
            0x1E02,
            0x010A,
            0x1E0A,
            0x0116,
            0x1E1E,
            0x0120,
            0x1E22,
            0x0130,
            0x1E40,
            0x1E44,
            0x022E,
            0x1E56,
            0x1E58,
            0x1E60,
            0x1E6A,
            0x1E86,
            0x1E8A,
            0x1E8E,
            0x017B,
            0x0227,
            0x1E03,
            0x010B,
            0x1E0B,
            0x0117,
            0x1E1F,
            0x0121,
            0x1E23,
            0x1E41,
            0x1E45,
            0x022F,
            0x1E57,
            0x1E59,
            0x1E61,
            0x1E6B,
            0x1E87,
            0x1E8B,
            0x1E8F,
            0x017C,
            // Precomposed letters with diaeresis (capitals, then small letters)
            0x00c4,
            0x00cb,
            0x1E26,
            0x00cf,
            0x00d6,
            0x00dc,
            0x1e84,
            0x1E8C,
            0x0178, // Y with diaeresis; capital partner of 0x00ff, was missing
            0x0187, // NOTE(review): looks like a digit transposition of 0x0178;
                    // kept so that previously accepted input stays accepted -
                    // confirm and drop
            0x00e4,
            0x00eb,
            0x1E27,
            0x00ef,
            0x00f6,
            0x1E97,
            0x00fc,
            0x1E85,
            0x1E8D,
            0x00ff,
            // Precomposed letters with ring above
            0x00c5,
            0x016e,
            0x00e5,
            0x016f,
            0x1E98,
            0x1E99,
            // Precomposed letters with horn
            0x01a0,
            0x01af,
            0x01a1,
            0x01b0,
            // Precomposed letters with caron
            0x0160,
            0x0161,
            // Precomposed letters with cedilla (capitals, then small letters)
            0x00c7,
            0x1E10,
            0x0228,
            0x0122,
            0x1e28,
            0x0136,
            0x013b,
            0x0145,
            0x0156,
            0x015e,
            0x0162,
            0x00e7, // c with cedilla; was 0x007e, which the contiguous range
                    // VALID_CHARS_START..VALID_CHARS_END already covers
            0x1E11,
            0x0229,
            0x0123,
            0x1e29, // h with cedilla; was 0x0e29 (a Thai code point), small
                    // partner of 0x1e28 above
            0x0137,
            0x013c,
            0x0146,
            0x0157,
            0x015f,
            0x0163,
            // Precomposed letters with ogonek (capitals, then small letters)
            0x0104,
            0x0118,
            0x012e,
            0x01EA,
            0x0172,
            0x0105,
            0x0119,
            0x012f,
            0x01EB,
            0x0173,
            // Precomposed letters with dot below (capitals, then small letters)
            0x1EA0,
            0x1EB8,
            0x1E36, // L with dot below; capital partner of 0x1E37, was missing
            0x1E38, // NOTE(review): L with dot below and macron, looks like a
                    // slip for 0x1E36; kept so that previously accepted input
                    // stays accepted - confirm and drop
            0x1ECC,
            0x1E5A,
            0x1E62,
            0x1EA1,
            0x1EB9,
            0x1E37,
            0x1ECD,
            0x1E5B,
            0x1E63
    };

    static {
        // Arrays.binarySearch requires ascending order; the literal above is
        // grouped by diacritic, not sorted.
        Arrays.sort(VALID_CHARS);
    }

    /**
     * Checks whether the given integer value corresponds to a permitted
     * character of the MAB character set.
     * <p>A value is accepted when it lies within
     * {@link #VALID_CHARS_START}..{@link #VALID_CHARS_END} (both inclusive) or
     * when it is contained in {@link #VALID_CHARS}. Rejected values are
     * reported on the debug log level.</p>
     * @param input the integer value of the character to check.
     * @return true if the value corresponds to a permitted MAB character,
     *         false otherwise.
     */
    public static boolean checkUTF8bytes(final int input) {
        if (input >= VALID_CHARS_START && input <= VALID_CHARS_END) {
            return true;
        }
        if (Arrays.binarySearch(VALID_CHARS, input) >= 0) {
            return true;
        }
        // Only build the message when it will actually be written; this method
        // is called per character.
        if (LOG.isDebugEnabled()) {
            LOG.debug(String.format("Wrong character 0x%1$04X found.", input));
        }
        return false;
    }

    /**
     * Not instantiable; the class only offers static members.
     */
    private MabCharsetChecker() {
    }

}