package org.apache.lucene.analysis.el;

/**
 * Copyright 2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * GreekCharsets class contains encoding schemes (charsets) and toLowerCase() method implementation
 * for Greek characters in Unicode, ISO-8859-7 and Microsoft Windows CP1253.
 * <p>
 * Each encoding scheme contains lowercase (positions 0-35) and uppercase (positions 36-68) characters,
 * including accented ones. One should be able to add other encoding schemes (see RFC 1947) by adding
 * the definition of a new charset as well as the required logic in the toLowerCase() method.
/**
 * </p>
 * <p>
 * {@link #toLowerCase(char, char[])} recognises the three tables declared in this class by
 * reference identity, so callers must pass one of these exact array instances to get
 * charset-specific folding; any other array falls back to {@link Character#toLowerCase(char)}.
 * </p>
 * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
 */
public class GreekCharsets {

  // Unicode Greek charset
  public static char[] UnicodeGreek = {
    // lower case
    '\u0390', '\u03AC', '\u03AD', '\u03AE', '\u03AF', '\u03B0',
    '\u03B1', '\u03B2', '\u03B3', '\u03B4', '\u03B5', '\u03B6',
    '\u03B7', '\u03B8', '\u03B9', '\u03BA', '\u03BB', '\u03BC',
    '\u03BD', '\u03BE', '\u03BF', '\u03C0', '\u03C1', '\u03C2',
    '\u03C3', '\u03C4', '\u03C5', '\u03C6', '\u03C7', '\u03C8',
    '\u03C9', '\u03CA', '\u03CB', '\u03CC', '\u03CD', '\u03CE',
    // upper case
    '\u0386', '\u0388', '\u0389', '\u038A', '\u038C', '\u038E',
    '\u038F', '\u0391', '\u0392', '\u0393', '\u0394', '\u0395',
    '\u0396', '\u0397', '\u0398', '\u0399', '\u039A', '\u039B',
    '\u039C', '\u039D', '\u039E', '\u039F', '\u03A0', '\u03A1',
    '\u03A3', '\u03A4', '\u03A5', '\u03A6', '\u03A7', '\u03A8',
    '\u03A9', '\u03AA', '\u03AB'
  };

  // ISO-8859-7 charset (ELOT-928)
  public static char[] ISO = {
    // lower case
    0xc0, 0xdc, 0xdd, 0xde, 0xdf, 0xe0,
    0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6,
    0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec,
    0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2,
    0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
    0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe,
    // upper case
    0xb6, 0xb8, 0xb9, 0xba, 0xbc, 0xbe,
    0xbf, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5,
    0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb,
    0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1,
    0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8,
    0xd9, 0xda, 0xdb
  };

  // CP1253 charset
  public static char[] CP1253 = {
    // lower case
    0xc0, 0xdc, 0xdd, 0xde, 0xdf, 0xe0,
    0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6,
    0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec,
    0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2,
    0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
    0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe,
    // upper case (differs from ISO only in the first entry, capital alpha with tonos)
    0xa2, 0xb8, 0xb9, 0xba, 0xbc, 0xbe,
    0xbf, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5,
    0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb,
    0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1,
    0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8,
    0xd9, 0xda, 0xdb
  };

  /**
   * Folds a Greek character to its lower-case, unaccented base letter within the given charset.
   * Small final sigma is folded to small sigma. Characters that are not handled explicitly are
   * passed to {@link Character#toLowerCase(char)}.
   *
   * @param letter the character to fold, encoded in <code>charset</code>
   * @param charset one of {@link #UnicodeGreek}, {@link #ISO} or {@link #CP1253}; it is matched
   *        by reference, and any other array makes the method return
   *        <code>Character.toLowerCase(letter)</code>
   * @return the folded character, expressed in the same charset as the input
   */
  public static char toLowerCase(char letter, char[] charset) {
    if (charset == UnicodeGreek) {
      return lowerCaseUnicode(letter);
    }
    if (charset == ISO) {
      return lowerCaseSingleByte(letter, 0xb6);
    }
    if (charset == CP1253) {
      return lowerCaseSingleByte(letter, 0xa2);
    }
    return Character.toLowerCase(letter);
  }

  /** Folds a character of the Unicode Greek block (ranges and accented letters are disjoint). */
  private static char lowerCaseUnicode(char letter) {
    // Lower case, not accented: already folded, except small final sigma -> small sigma.
    if (letter >= '\u03B1' && letter <= '\u03C9') {
      return letter == '\u03C2' ? '\u03C3' : letter;
    }
    // Upper case, not accented: the lower-case form sits 32 code points higher.
    if (letter >= '\u0391' && letter <= '\u03A9') {
      return (char) (letter + 32);
    }
    // Accented letters, lower case first and upper case second on each row.
    switch (letter) {
      case '\u03AC': // alpha with acute
      case '\u0386':
        return '\u03B1';
      case '\u03AD': // epsilon with acute
      case '\u0388':
        return '\u03B5';
      case '\u03AE': // eta with acute
      case '\u0389':
        return '\u03B7';
      case '\u03AF': // iota with acute
      case '\u03CA': // iota with diaeresis
      case '\u0390': // iota with acute and diaeresis
      case '\u038A':
      case '\u03AA':
        return '\u03B9';
      case '\u03CD': // upsilon with acute
      case '\u03CB': // upsilon with diaeresis
      case '\u03B0': // upsilon with acute and diaeresis
      case '\u038E':
      case '\u03AB':
        return '\u03C5';
      case '\u03CC': // omicron with acute
      case '\u038C':
        return '\u03BF';
      case '\u03CE': // omega with acute
      case '\u038F':
        return '\u03C9';
      default:
        return Character.toLowerCase(letter);
    }
  }

  /**
   * Folds a character of ISO-8859-7 or CP1253. The two layouts are handled identically except
   * for capital alpha with tonos, whose code is supplied by the caller.
   */
  private static char lowerCaseSingleByte(char letter, int capitalAlphaTonos) {
    // Lower case, not accented: already folded, except small final sigma -> small sigma.
    if (letter >= 0xe1 && letter <= 0xf9) {
      return letter == 0xf2 ? (char) 0xf3 : letter;
    }
    // Upper case, not accented: the lower-case form sits 32 positions higher.
    if (letter >= 0xc1 && letter <= 0xd9) {
      return (char) (letter + 32);
    }
    if (letter == capitalAlphaTonos) {
      return 0xe1;
    }
    // Accented letters, lower case first and upper case second on each row.
    switch (letter) {
      case 0xdc: // alpha with acute
        return 0xe1;
      case 0xdd: // epsilon with acute
      case 0xb8:
        return 0xe5;
      case 0xde: // eta with acute
      case 0xb9:
        return 0xe7;
      case 0xdf: // iota with acute
      case 0xfa: // iota with diaeresis
      case 0xc0: // iota with acute and diaeresis
      case 0xba:
      case 0xda:
        // Must be the single-byte iota (0xe9): returning the Unicode code point U+03B9 here
        // would hand back a character that does not belong to this charset.
        return 0xe9;
      case 0xfd: // upsilon with acute
      case 0xfb: // upsilon with diaeresis
      case 0xe0: // upsilon with acute and diaeresis
      case 0xbe:
      case 0xdb:
        return 0xf5;
      case 0xfc: // omicron with acute
      case 0xbc:
        return 0xef;
      case 0xfe: // omega with acute
      case 0xbf:
        return 0xf9;
      default:
        return Character.toLowerCase(letter);
    }
  }
}