pytils/typo.py
author pythy <the.pythy@gmail.com>
Mon Sep 22 23:42:30 2008 +0700 (4 weeks ago)
changeset 105 1ff4355c8413
parent 983a022fa3e097
permissions -rw-r--r--
Make .hgignore
     1 # -*- coding: utf-8 -*-
     2 # -*- test-case-name: pytils.test.test_typo -*-
     3 # pytils - russian-specific string utils
     4 # Copyright (C) 2006-2008  Yury Yurevich
     5 #
     6 # /projects/pytils/
     7 #
     8 # This program is free software; you can redistribute it and/or
     9 # modify it under the terms of the GNU General Public License
    10 # as published by the Free Software Foundation, version 2
    11 # of the License.
    12 #
    13 # This program is distributed in the hope that it will be useful,
    14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
    15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    16 # GNU General Public License for more details.
    17 """
    18 Russian typography
    19 """
    20 import re
    21 import os
    22 
    23 def _sub_patterns(patterns, text):
    24     """
    25     Apply re.sub to bunch of (pattern, repl)
    26     """
    27     for pattern, repl in patterns:
    28         text = re.sub(pattern, repl, text)
    29     return text    
    30 
    31 ## ---------- rules -------------
# a rule is a regular function;
# the naming convention is rl_RULENAME
    34 def rl_testrule(x):
    35     """
    36     Rule for tests. Do nothing.
    37     """
    38     return x
    39 
    40 def rl_cleanspaces(x):
    41     """
    42     Clean double spaces, trailing spaces, heading spaces,
    43     spaces before punctuations 
    44     """
    45     patterns = (
    46         # arguments for re.sub: pattern and repl
    47         # удаляем пробел перед знаками препинания
    48         (r' +([\.,?!\)]+)', r'\1'),
    49         # добавляем пробел после знака препинания, если только за ним нет другого
    50         (r'([\.,?!\)]+)([^\.!,?\)]+)', r'\1 \2'),
    51         # убираем пробел после открывающей скобки
    52         (r'(\S+)\s*(\()\s*(\S+)', r'\1 (\3'),
    53     )
    54     # удаляем двойные, начальные и конечные пробелы
    55     return os.linesep.join(
    56         ' '.join(part for part in line.split(' ') if part)
    57         for line in _sub_patterns(patterns, x).split(os.linesep)
    58     )
    59 
    60 def rl_ellipsis(x):
    61     """
    62     Replace three dots to ellipsis
    63     """
    64 
    65     patterns = (
    66         # если больше трех точек, то не заменяем на троеточие
    67         # чтобы не было глупых .....->…..
    68         (r'([^\.]|^)\.\.\.([^\.]|$)', u'\\1\u2026\\2'),
    69         # если троеточие в начале строки или возле кавычки --
    70         # это цитата, пробел между троеточием и первым
    71         # словом нужно убрать
    72         (re.compile(u'(^|\\"|\u201c|\xab)\\s*\u2026\\s*([А-Яа-яA-Za-z])', re.UNICODE), u'\\1\u2026\\2'),
    73         
    74     )
    75     return _sub_patterns(patterns, x)
    76 
    77 def rl_initials(x):
    78     """
    79     Replace space between initials and surname by thin space
    80     """
    81     return re.sub(
    82         re.compile(u'([А-Я])\\.\\s*([А-Я])\\.\\s*([А-Я][а-я]+)', re.UNICODE),
    83         u'\\1.\\2.\u2009\\3',
    84         x
    85     )
    86 
    87 def rl_dashes(x):
    88     """
    89     Replace dash to long/medium dashes
    90     """
    91     patterns = (
    92         # тире
    93         (re.compile(u'(^|(.\\s))\\-\\-?(([\\s\u202f].)|$)', re.MULTILINE|re.UNICODE), u'\\1\u2014\\3'),
    94         # диапазоны между цифрами - en dash
    95         (re.compile(u'(\\d[\\s\u2009]*)\\-([\\s\u2009]*\d)', re.MULTILINE|re.UNICODE), u'\\1\u2013\\2'),
    96         # TODO: а что с минусом?
    97     )
    98     return _sub_patterns(patterns, x)
    99 
   100 def rl_wordglue(x):
   101     """
   102     Glue (set nonbreakable space) short words with word before/after
   103     """
   104     patterns = (
   105         # частицы склеиваем с предыдущим словом
   106         (re.compile(u'(\\s+)(же|ли|ль|бы|б|ж|ка)([\\.,!\\?:;]?\\s+)', re.UNICODE), u'\u202f\\2\\3'),
   107         # склеиваем короткие слова со следующим словом
   108         (re.compile(u'\\b([a-zA-ZА-Яа-я]{1,3})(\\s+)', re.UNICODE), u'\\1\u202f'),
   109         # склеиваем тире с предыдущим словом 
   110         (re.compile(u'(\\s+)([\u2014\\-]+)(\\s+)', re.UNICODE), u'\u202f\\2\\3'),
   111         # склеиваем два последних слова в абзаце между собой
   112         # полагается, что абзацы будут передаваться отдельной строкой
   113         (re.compile(u'([^\\s]+)\\s+([^\\s]+)$', re.UNICODE), u'\\1\u202f\\2'),
   114     )
   115     return _sub_patterns(patterns, x)
   116 
   117 def rl_marks(x):
   118     """
   119     Replace +-, (c), (tm), (r), (p), etc by its typographic eqivalents
   120     """
   121     # простые замены, можно без регулярок
   122     replacements = (
   123         (u'(r)', u'\u00ae'), # ®
   124         (u'(R)', u'\u00ae'), # ®
   125         (u'(p)', u'\u00a7'), # §
   126         (u'(P)', u'\u00a7'), # §
   127         (u'(tm)', u'\u2122'), # ™
   128         (u'(TM)', u'\u2122'), # ™
   129     )
   130     patterns = (
   131         # копирайт ставится до года: © 2008 Юрий Юревич
   132         (re.compile(u'\\([cCсС]\\)\\s*(\\d+)', re.UNICODE), u'\u00a9\u202f\\1'),
   133         (r'([^+])(\+\-|\-\+)', u'\\1\u00b1'), # ±
   134         # градусы с минусом
   135         (u'\\-(\\d+)[\\s]*([FCС][^\\w])', u'\u2212\\1\202f\u00b0\\2'), # −12 °C, −53 °F
   136         # градусы без минуса
   137         (u'(\\d+)[\\s]*([FCС][^\\w])', u'\\1\u202f\u00b0\\2'), # 12 °C, 53 °F
   138         # ® и ™ приклеиваются к предыдущему слову, без пробела
   139         (re.compile(u'([A-Za-zА-Яа-я\\!\\?])\\s*(\xae|\u2122)', re.UNICODE), u'\\1\\2'),
   140         # No5 -> № 5
   141         (re.compile(u'(\\s)(No|no|NO|\u2116)[\\s\u2009]*(\\d+)', re.UNICODE), u'\\1\u2116\u2009\\3'),
   142     )
   143 
   144     for what, to in replacements:
   145         x = x.replace(what, to)
   146     return _sub_patterns(patterns, x)
   147 
   148 def rl_quotes(x):
   149     """
   150     Replace quotes by typographic quotes
   151     """
   152     patterns = (
   153         # открывающие кавычки ставятся обычно вплотную к слову слева
   154         # а закрывающие -- вплотную справа
   155         # открывающие русские кавычки-ёлочки
   156         (re.compile(u'(^|\\s|\\()\\"([А-Яа-я0-9,\\-:\\/\\.])', re.UNICODE), u'\\1\xab\\2'),
   157         # закрывающие русские кавычки-ёлочки
   158         # ищем открывающую кавычку-ёлочку и следующую закрывающую кавычку
   159         (re.compile(u'\xab([^\\"])([А-Яа-яA-Za-z0-9,\\-:\\/\\.\\?\\!\\s]+)\\"', re.UNICODE), u'\xab\\1\\2\xbb'),
   160         # открывающие кавычки-лапки
   161         (re.compile(u'(^|\\s|\\()\\"([A-Za-z0-9,\\-:\\/\\.\\?\\!\\&])', re.UNICODE), u'\\1\u201c\\2'),
   162         # закрывающие русские кавычки-лапки
   163         # ищем открывающую кавычку-лапку и следующую закрывающую кавычку
   164         (re.compile(u'\u201c([^\\"])([A-Za-zА-Яа-я0-9,\\-:\\/\\.\\?\\!\\&\\s]+)\\"', re.UNICODE), u'\u201c\\1\\2\u201d'),
   165     )
   166     return _sub_patterns(patterns, x)
   167     
   168 
   169 ## -------- rules end ----------
   170 STANDARD_RULES = ('cleanspaces', 'ellipsis', 'initials', 'marks', 'dashes', 'wordglue', 'quotes')
   171 
   172 def _get_rule_by_name(name):
   173 
   174     rule = globals().get('rl_%s' % name)
   175     if rule is None:
   176         raise ValueError("Rule %s is not found" % name)
   177     if not callable(rule):
   178         raise ValueError("Rule with name %s is not callable" % name)
   179     return rule
   180 
   181 def _resolve_rule_name(rule_or_name, forced_name=None):
   182     if isinstance(rule_or_name, str):
   183         # got name
   184         name = rule_or_name
   185         rule = _get_rule_by_name(name)
   186     elif callable(rule_or_name):
   187         # got rule
   188         name = rule_or_name.__name__
   189         if name.startswith('rl_'):
   190             # by rule name convention
   191             # rule is a function with name rl_RULENAME
   192             name = name[3:]
   193         rule = rule_or_name
   194     else:
   195         raise ValueError(
   196             "Cannot resolve %r: neither rule, nor name" % 
   197             rule_or_name)
   198     if forced_name is not None:
   199         name = forced_name
   200     return name, rule
   201 
   202 class Typography(object):
   203     """
   204     Russian typography rules applier
   205     """
   206     def __init__(self, *args, **kwargs):
   207         """
   208         Typography applier constructor:
   209         
   210         possible variations of constructing rules chain:
   211             rules by it's names:
   212                 Typography('first_rule', 'second_rule')
   213             rules callables as is:
   214                 Typography(cb_first_rule, cb_second_rule)
   215             mixed:
   216                 Typography('first_rule', cb_second_rule)
   217             as list:
   218                 Typography(['first_rule', cb_second_rule])
   219             as keyword args:
   220                 Typography(rule_name='first_rule', 
   221                            another_rule=cb_second_rule)
   222             as dict (order of rule execution is not the same):
   223                 Typography({'rule name': 'first_rule', 
   224                             'another_rule': cb_second_rule})
   225         
   226         For standard rules it is recommended to use list of rules
   227         names.
   228             Typography(['first_rule', 'second_rule'])
   229         
   230         For custom rules which are named functions, 
   231         it is recommended to use list of callables:
   232             Typography([cb_first_rule, cb_second_rule])
   233         
   234         For custom rules which are lambda-functions,
   235         it is recommended to use dict:
   236             Typography({'rule_name': lambda x: x})
   237             
   238         I.e. the recommended usage is:
   239             Typography(['standard_rule_1', 'standard_rule_2'],
   240                        [cb_custom_rule1, cb_custom_rule_2],
   241                        {'custom_lambda_rule': lambda x: x})
   242         """     
   243         self.rules = {}
   244         self.rules_names = []
   245         # first of all, expand args-lists and args-dicts
   246         expanded_args = []
   247         expanded_kwargs = {}
   248         for arg in args:
   249             if isinstance(arg, (tuple, list)):
   250                 expanded_args += list(arg)
   251             elif isinstance(arg, dict):
   252                 expanded_kwargs.update(arg)
   253             elif isinstance(arg, str) or callable(arg):
   254                 expanded_args.append(arg)
   255             else:
   256                 raise TypeError(
   257                     "Cannot expand arg %r, must be tuple, list,"\
   258                     " dict, str or callable, not" % 
   259                     (arg, type(arg).__name__))
   260         for kw, arg in kwargs.items():
   261             if isinstance(arg, str) or callable(arg):
   262                 expanded_kwargs[kw] = arg
   263             else:
   264                 raise TypeError(
   265                     "Cannot expand kwarg %r, must be str or "\
   266                     "callable, not" % (arg, type(arg).__name__))
   267         # next, resolve rule names to callables
   268         for name, rule in (_resolve_rule_name(a) for a in expanded_args):
   269             self.rules[name] = rule
   270             self.rules_names.append(name)
   271         for name, rule in (_resolve_rule_name(a, k) for k, a in expanded_kwargs.items()):
   272             self.rules[name] = rule
   273             self.rules_names.append(name)
   274         
   275     def apply_single_rule(self, rulename, text):
   276         if rulename not in self.rules:
   277             raise ValueError("Rule %s is not found in active rules" % rulename)
   278         try:
   279             res = self.rules[rulename](text)
   280         except ValueError, e:
   281             raise ValueError("Rule %s failed to apply: %s" % (rulename, e))
   282         return res
   283     
   284     def apply(self, text):
   285         for rule in self.rules_names:
   286             text = self.apply_single_rule(rule, text)
   287         return text
   288         
   289     def __call__(self, text):
   290         return self.apply(text)
   291 
   292 def typography(text):
   293     t = Typography(STANDARD_RULES)
   294     return t.apply(text)
   295 
   296 if __name__ == '__main__':
   297     from pytils.test import run_tests_from_module, test_typo
   298     run_tests_from_module(test_typo, verbosity=2)
   299