pytils/translit.py
author pythy <the.pythy@gmail.com>
Mon Sep 22 23:42:30 2008 +0700 (5 weeks ago)
changeset 105 1ff4355c8413
parent 9695a111988f1a
permissions -rw-r--r--
Make .hgignore
     1 # -*- coding: utf-8 -*-
     2 # -*- test-case-name: pytils.test.test_translit -*-
     3 # pytils - russian-specific string utils
     4 # Copyright (C) 2006-2008  Yury Yurevich
     5 #
     6 # /projects/pytils/
     7 #
     8 # This program is free software; you can redistribute it and/or
     9 # modify it under the terms of the GNU General Public License
    10 # as published by the Free Software Foundation, version 2
    11 # of the License.
    12 #
    13 # This program is distributed in the hope that it will be useful,
    14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
    15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    16 # GNU General Public License for more details.
    17 """
    18 Simple transliteration
    19 """
    20 
    21 import re
    22 from pytils.utils import takes, returns
    23 
    24 TRANSTABLE = (
    25         (u"'", u"'"),
    26         (u'"', u'"'),
    27         (u"‘", u"'"),
    28         (u"’", u"'"),
    29         (u"«", u'"'),
    30         (u"»", u'"'),
    31         (u"“", u'"'),
    32         (u"”", u'"'),
    33         (u"–", u"-"),  # en dash
    34         (u"—", u"-"),  # em dash
    35         (u"‒", u"-"),  # figure dash
    36         (u"−", u"-"),  # minus
    37         (u"…", u"..."),
    38         (u"№", u"#"),
    39         ## верхний регистр
    40         # трехбуквенные замены
    41         (u"Щ", u"Sch"),
    42         # при замене русский->английский будет первая замена,
    43         # т.е. Sch
    44         # а вот если английский->русский, то вариант SCH и Sch --
    45         # оба пройдут
    46         (u"Щ", u"SCH"),
    47         # двухбуквенные замены
    48         (u"Ё", u"Yo"),
    49         (u"Ё", u"YO"),
    50         (u"Ж", u"Zh"),
    51         (u"Ж", u"ZH"),
    52         (u"Ц", u"Ts"),
    53         (u"Ц", u"TS"),
    54         (u"Ч", u"Ch"),
    55         (u"Ч", u"CH"),
    56         (u"Ш", u"Sh"),
    57         (u"Ш", u"SH"),
    58         (u"Ы", u"Yi"),
    59         (u"Ы", u"YI"),
    60         (u"Ю", u"Yu"),
    61         (u"Ю", u"YU"),
    62         (u"Я", u"Ya"),
    63         (u"Я", u"YA"),
    64         # однобуквенные замены
    65         (u"А", u"A"),
    66         (u"Б", u"B"),
    67         (u"В", u"V"),
    68         (u"Г", u"G"),
    69         (u"Д", u"D"),
    70         (u"Е", u"E"),
    71         (u"З", u"Z"),
    72         (u"И", u"I"),
    73         (u"Й", u"J"),
    74         (u"К", u"K"),
    75         (u"Л", u"L"),
    76         (u"М", u"M"),
    77         (u"Н", u"N"),
    78         (u"О", u"O"),
    79         (u"П", u"P"),
    80         (u"Р", u"R"),
    81         (u"С", u"S"),
    82         (u"Т", u"T"),
    83         (u"У", u"U"),
    84         (u"Ф", u"F"),
    85         (u"Х", u"H"),
    86         (u"Э", u"E"),
    87         (u"Ъ", u"`"),
    88         (u"Ь", u"'"),
    89         ## нижний регистр
    90         # трехбуквенные замены
    91         (u"щ", u"sch"),
    92         # двухбуквенные замены
    93         (u"ё", u"yo"),
    94         (u"ж", u"zh"),
    95         (u"ц", u"ts"),
    96         (u"ч", u"ch"),
    97         (u"ш", u"sh"),
    98         (u"ы", u"yi"),
    99         (u"ю", u"yu"),
   100         (u"я", u"ya"),
   101         # однобуквенные замены
   102         (u"а", u"a"),
   103         (u"б", u"b"),
   104         (u"в", u"v"),
   105         (u"г", u"g"),
   106         (u"д", u"d"),
   107         (u"е", u"e"),
   108         (u"з", u"z"),
   109         (u"и", u"i"),
   110         (u"й", u"j"),
   111         (u"к", u"k"),
   112         (u"л", u"l"),
   113         (u"м", u"m"),
   114         (u"н", u"n"),
   115         (u"о", u"o"),
   116         (u"п", u"p"),
   117         (u"р", u"r"),
   118         (u"с", u"s"),
   119         (u"т", u"t"),
   120         (u"у", u"u"),
   121         (u"ф", u"f"),
   122         (u"х", u"h"),
   123         (u"э", u"e"),
   124         (u"ъ", u"`"),
   125         (u"ь", u"'"),
   126         # для полноты английского алфавит (в slugify)
   127         # дополняем английскими буквами, которых
   128         # не в парах
   129         (u"c", u"c"),
   130         (u"q", u"q"),
   131         (u"y", u"y"),
   132         (u"x", u"x"),
   133         (u"w", u"w"),
   134         (u"1", u"1"),
   135         (u"2", u"2"),
   136         (u"3", u"3"),
   137         (u"4", u"4"),
   138         (u"5", u"5"),
   139         (u"6", u"6"),
   140         (u"7", u"7"),
   141         (u"8", u"8"),
   142         (u"9", u"9"),
   143         (u"0", u"0"),
   144         )  #: Translation table
   145 
   146 RU_ALPHABET = [x[0] for x in TRANSTABLE] #: Russian alphabet that we can translate
   147 EN_ALPHABET = [x[1] for x in TRANSTABLE] #: English alphabet that we can detransliterate
   148 ALPHABET = RU_ALPHABET + EN_ALPHABET #: Alphabet that we can (de)transliterate
   149 
   150 @takes(unicode)
   151 @returns(str)
   152 def translify(in_string):
   153     """
   154     Translify russian text
   155 
   156     @param in_string: input string
   157     @type in_string: C{unicode}
   158 
   159     @return: transliterated string
   160     @rtype: C{str}
   161 
   162     @raise L{pytils.err.InputParameterError}: input parameters' check failed
   163         (in_string is not C{unicode})
   164     @raise ValueError: when string doesn't transliterate completely
   165     """
   166     translit = in_string
   167     for symb_in, symb_out in TRANSTABLE:
   168         translit = translit.replace(symb_in, symb_out)
   169 
   170     try:
   171         translit = str(translit)
   172     except UnicodeEncodeError:
   173         raise ValueError("Unicode string doesn't transliterate completely, " + \
   174                          "is it russian?")
   175 
   176     return translit
   177 
   178 @takes(basestring)
   179 @returns(unicode)
   180 def detranslify(in_string):
   181     """
   182     Detranslify
   183 
   184     @param in_string: input string
   185     @type in_string: C{basestring}
   186 
   187     @return: detransliterated string
   188     @rtype: C{unicode}
   189 
   190     @raise L{pytils.err.InputParameterError}: input parameters' check failed
   191         (when in_string not C{basestring})
   192     @raise ValueError: if in_string is C{str}, but it isn't ascii
   193     """
   194     # в unicode
   195     try:
   196         russian = unicode(in_string)
   197     except UnicodeDecodeError:
   198         raise ValueError("We expects if in_string is 8-bit string," + \
   199                          "then it consists only ASCII chars, but now it doesn't. " + \
   200                          "Use unicode in this case.")
   201 
   202     for symb_out, symb_in in TRANSTABLE:
   203         russian = russian.replace(symb_in, symb_out)
   204 
   205     return russian
   206 
   207 @takes(basestring)
   208 @returns(str)
   209 def slugify(in_string):
   210     """
   211     Prepare string for slug (i.e. URL or file/dir name)
   212 
   213     @param in_string: input string
   214     @type in_string: C{basestring}
   215 
   216     @return: slug-string
   217     @rtype: C{str}
   218 
   219     @raise L{pytils.err.InputParameterError}: input parameters' check failed
   220         (when in_string isn't C{unicode} or C{str})
   221     @raise ValueError: if in_string is C{str}, but it isn't ascii
   222     """
   223     try:
   224         u_in_string = unicode(in_string).lower()
   225     except UnicodeDecodeError:
   226         raise ValueError("We expects when in_string is str type," + \
   227                          "it is an ascii, but now it isn't. Use unicode " + \
   228                          "in this case.")
   229     # convert & to "and"
   230     u_in_string = re.sub('\&amp\;|\&', ' and ', u_in_string)
   231     # replace spaces by hyphen
   232     u_in_string = re.sub('[-\s]+', '-', u_in_string)
   233     # remove symbols that not in alphabet
   234     u_in_string = u''.join([symb for symb in u_in_string if symb in ALPHABET])
   235     # translify it
   236     out_string = translify(u_in_string)
   237     # remove non-alpha
   238     return re.sub('[^\w\s-]', '', out_string).strip().lower()
   239 
   240 
   241 def dirify(in_string):
   242     """
   243     Alias for L{slugify}
   244     """
   245     slugify(in_string)