xhtml.py 16 KB
Newer Older
1
# Copyright 2010-2011 Florent Le Coz <louiz@louiz.org>
2 3 4 5
#
# This file is part of Poezio.
#
# Poezio is free software: you can redistribute it and/or modify
6
# it under the terms of the zlib license. See the COPYING file.
7 8 9 10 11 12 13 14

"""
Various methods to convert
shell colors to poezio colors,
xhtml code to shell colors,
poezio colors to xhtml code
"""

15
import base64
16
import curses
17 18 19
import hashlib
import re
from os import path
louiz’'s avatar
louiz’ committed
20
from slixmpp.xmlstream import ET
21
from urllib.parse import unquote
22

23 24 25
from io import BytesIO
from xml import sax
from xml.sax import saxutils
26

27
# ASCII decimal digits, spelled out locally instead of relying on a
# library constant.
digits = '0123456789' # never trust the modules

# Namespace URI of XHTML elements; the SAX handler compares element
# namespaces against it.
XHTML_NS = 'http://www.w3.org/1999/xhtml'

mathieui's avatar
mathieui committed
31
# HTML named colors
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
# Lookup table from lower-case HTML/CSS color names to integer color codes.
# parse_css() emits the looked-up value inside a '\x19<n>}' color attribute.
# NOTE(review): the values look like xterm 256-color palette indexes
# (color cube and grey ramp) -- confirm against the terminal color setup.
colors = {
    'aliceblue': 231,
    'antiquewhite': 231,
    'aqua': 51,
    'aquamarine': 122,
    'azure': 231,
    'beige': 231,
    'bisque': 230,
    'black': 232,
    'blanchedalmond': 230,
    'blue': 21,
    'blueviolet': 135,
    'brown': 124,
    'burlywood': 223,
    'cadetblue': 109,
    'chartreuse': 118,
    'chocolate': 172,
    'coral': 209,
    'cornflowerblue': 111,
    'cornsilk': 231,
    'crimson': 197,
    'cyan': 51,
    'darkblue': 19,
    'darkcyan': 37,
    'darkgoldenrod': 178,
    'darkgray': 247,
    'darkgreen': 28,
    'darkgrey': 247,
    'darkkhaki': 186,
    'darkmagenta': 127,
    'darkolivegreen': 65,
    'darkorange': 214,
    'darkorchid': 134,
    'darkred': 124,
    'darksalmon': 216,
    'darkseagreen': 151,
    'darkslateblue': 61,
    'darkslategray': 59,
    'darkslategrey': 59,
    'darkturquoise': 44,
    'darkviolet': 128,
    'deeppink': 199,
    'deepskyblue': 45,
    'dimgray': 241,
    'dimgrey': 241,
    'dodgerblue': 39,
    'firebrick': 160,
    'floralwhite': 231,
    'forestgreen': 34,
    'fuchsia': 201,
    'gainsboro': 252,
    'ghostwhite': 231,
    'gold': 226,
    'goldenrod': 214,
    'gray': 244,
    'green': 34,
    'greenyellow': 191,
    'grey': 244,
    'honeydew': 231,
    'hotpink': 212,
    'indianred': 174,
    'indigo': 55,
    'ivory': 231,
    'khaki': 229,
    'lavender': 231,
    'lavenderblush': 231,
    'lawngreen': 118,
    'lemonchiffon': 230,
    'lightblue': 195,
    'lightcoral': 217,
    'lightcyan': 231,
    'lightgoldenrodyellow': 230,
    'lightgray': 251,
    'lightgreen': 157,
    'lightgrey': 251,
    'lightpink': 224,
    'lightsalmon': 216,
    'lightseagreen': 43,
    'lightskyblue': 153,
    'lightslategray': 109,
    'lightslategrey': 109,
    'lightsteelblue': 189,
    'lightyellow': 231,
    'lime': 46,
    'limegreen': 77,
    'linen': 231,
    'magenta': 201,
    'maroon': 124,
    'mediumaquamarine': 115,
    'mediumblue': 20,
    'mediumorchid': 170,
    'mediumpurple': 141,
    'mediumseagreen': 78,
    'mediumslateblue': 105,
    'mediumspringgreen': 49,
    'mediumturquoise': 80,
    'mediumvioletred': 163,
    'midnightblue': 18,
    'mintcream': 231,
    'mistyrose': 231,
    'moccasin': 230,
    'navajowhite': 230,
    'navy': 19,
    'oldlace': 231,
    'olive': 142,
    'olivedrab': 106,
    'orange': 214,
    'orangered': 202,
    'orchid': 213,
    'palegoldenrod': 229,
    'palegreen': 157,
    'paleturquoise': 195,
    'palevioletred': 211,
    'papayawhip': 231,
    'peachpuff': 230,
    'peru': 179,
    'pink': 224,
    'plum': 219,
    'powderblue': 195,
    'purple': 127,
    'red': 196,
    'rosybrown': 181,
    'royalblue': 69,
    'saddlebrown': 130,
    'salmon': 216,
    'sandybrown': 216,
    'seagreen': 72,
    'seashell': 231,
    'sienna': 131,
    'silver': 250,
    'skyblue': 153,
    'slateblue': 104,
    'slategray': 109,
    'slategrey': 109,
    'snow': 231,
    'springgreen': 48,
    'steelblue': 74,
    'tan': 187,
    'teal': 37,
    'thistle': 225,
    'tomato': 209,
    'turquoise': 86,
    'violet': 219,
    'wheat': 230,
    'white': 255,
    'whitesmoke': 255,
    'yellow': 226,
    'yellowgreen': 149
}
181

louiz’'s avatar
tg  
louiz’ committed
182
# One or more whitespace characters; trim() collapses each run to one space.
whitespace_re = re.compile(r'\s+')

# A poezio attribute in the full format: '\x19', an optional '-', a digit
# and everything up to the next '}' -- or '\x19' followed by one of the
# letters b, u, a, i, o. clean_text() strips every match.
xhtml_attr_re = re.compile(r'\x19-?\d[^}]*}|\x19[buaio]')
# A base64 "data:image/<subtype>" URI; group 1 is the subtype (lower-case
# letters only), group 2 the encoded payload.
xhtml_data_re = re.compile(r'data:image/([a-z]+);base64,(.+)')
# Two or more consecutive color attributes. XHTMLHandler.result replaces the
# match with group 1 so that only the last color of the run survives.
poezio_color_double = re.compile(r'(?:\x19\d+}|\x19\d)+(\x19\d|\x19\d+})')
# A run of attributes immediately followed by '\x19o'.
# XHTMLHandler.result replaces the whole match with a single '\x19o'.
poezio_format_trim = re.compile(r'(\x19\d+}|\x19\d|\x19[buaio]|\x19o)+\x19o')

# A color attribute in the simple format: '\x19' followed by one digit.
xhtml_simple_attr_re = re.compile(r'\x19\d')

191 192
def get_body_from_message_stanza(message, use_xhtml=False,
                                 tmp_dir=None, extract_images=False):
    """
    Returns a string with xhtml markups converted to
    poezio colors if there's an xhtml_im element, or
    the body (without any color) otherwise.

    `message` is indexed with 'body' and 'html'; the 'html' item must
    expose the XML element as `.xml`. `tmp_dir` and `extract_images`
    are passed through to xhtml_to_poezio_colors().

    When the conversion yields an empty string the plain body is used
    instead, and a single space is returned if that is empty too.
    """
    if not use_xhtml:
        return message['body']

    xhtml = message['html'].xml
    xhtml_body = xhtml.find('{http://www.w3.org/1999/xhtml}body')
    # Compare with None explicitly: an Element without child elements is
    # falsy, so a plain truth test would skip bodies containing only text.
    if xhtml_body is None:
        return message['body']

    content = xhtml_to_poezio_colors(xhtml_body, tmp_dir=tmp_dir,
                                     extract_images=extract_images)
    content = content if content else message['body']
    return content or " "

208 209 210 211 212 213 214
def ncurses_color_to_html(color):
    """
    Takes an int between 0 and 255 and returns
    a string of the form #XXXXXX representing an
    html color.

    Colors 0-15 are read from curses, 16-231 are decoded as a
    6x6x6 color cube and the remaining ones as a grey ramp.
    """
    if color <= 15:
        try:
            (r, g, b) = curses.color_content(color)
        except Exception: # fallback in faulty terminals (e.g. xterm)
            (r, g, b) = curses.color_content(color % 8)
        # curses components go from 0 to 1000; scale them to [0, 6) so
        # that the final conversion stays below 256
        r = r / 1000 * 6 - 0.01
        g = g / 1000 * 6 - 0.01
        b = b / 1000 * 6 - 0.01
    elif color <= 231:
        # index = 16 + 36*r + 6*g + b (the same layout parse_css() uses
        # to build a color), hence blue is the least significant digit.
        # Integer division is required to keep the components integral.
        color -= 16
        b = color % 6
        color //= 6
        g = color % 6
        color //= 6
        r = color % 6
    else:
        color -= 232
        r = g = b = color / 24 * 6
    return '#%02X%02X%02X' % (int(r*256/6), int(g*256/6), int(b*256/6))
233

234 235 236 237 238 239 240 241 242 243 244 245 246 247 248
def parse_css(css):
    """
    Convert an inline CSS declaration list ("key: value; key: value")
    into the matching string of poezio attributes.

    Handled properties: color (as \\x19<n>}), font-style (\\x19i),
    font-weight (\\x19b), margin-left (four spaces) and text-decoration
    (underline: \\x19u, blink: \\x19a). background-color and text-align
    are recognised but produce nothing; other properties, and colors
    that cannot be parsed, are ignored.
    """
    def get_color(value):
        """
        Return the color code for a CSS color value (#rgb, #rrggbb or a
        color name), or -1 when the value is not understood.
        """
        if not value:
            # e.g. "color:" without any value
            return -1
        if value[0] == '#':
            value = value[1:]
            length = len(value)
            if length != 3 and length != 6:
                return -1
            # int() would raise on garbage and accept signs or
            # underscores: only let real hexadecimal digits through
            if any(char not in '0123456789abcdefABCDEF' for char in value):
                return -1
            value = int(value, 16)
            if length == 6:
                r = int(value >> 16)
                g = int((value >> 8) & 0xff)
                b = int(value & 0xff)
                if r == g == b:
                    # grey: use the grey ramp starting at 232
                    return 232 + int(r/10.6251)
                div = 42.51
            else:
                r = int(value >> 8)
                g = int((value >> 4) & 0xf)
                b = int(value & 0xf)
                if r == g == b:
                    return 232 + int(1.54*r)
                div = 2.51
            # each component is reduced to 0-5 and packed in base 6
            return 6*6*int(r/div) + 6*int(g/div) + int(b/div) + 16
        # CSS color keywords are case-insensitive
        return colors.get(value.lower(), -1)

    shell = []
    for rule in css.split(';'):
        if ':' not in rule:
            continue
        key, value = rule.split(':', 1)
        key = key.strip()
        value = value.strip()
        if key == 'color':
            color = get_color(value)
            if color != -1:
                shell.append('\x19%d}' % color)
        elif key == 'font-style':
            shell.append('\x19i')
        elif key == 'font-weight':
            shell.append('\x19b')
        elif key == 'margin-left':
            shell.append('    ')
        elif key == 'text-decoration':
            if value == 'underline':
                shell.append('\x19u')
            elif value == 'blink':
                shell.append('\x19a')
        # 'background-color' and 'text-align' are deliberately ignored
    return ''.join(shell)

def trim(string):
    """Return `string` with every run of whitespace collapsed to one space."""
    return whitespace_re.sub(' ', string)

class XHTMLHandler(sax.ContentHandler):
    """
    SAX content handler converting XHTML markup into a string carrying
    poezio formatting attributes (\\x19...).

    The output is accumulated piece by piece in `builder`; read the
    final string from the `result` property once parsing is done.
    """

    def __init__(self, force_ns=False, tmp_dir=None, extract_images=False):
        super().__init__()
        # output fragments, joined by `result`
        self.builder = []
        # stack of the formatting attributes currently in effect
        self.formatting = []
        # stack of attribute dicts, one per open element
        self.attrs = []
        # stack of open lists: 'ul', or the next item number of an <ol>
        self.list_state = []
        self.is_pre = False
        # index in `builder` where the text of the current <a> starts
        self.a_start = 0
        # do not care about xhtml-in namespace
        self.force_ns = force_ns

        self.tmp_dir = tmp_dir
        self.extract_images = extract_images

    @property
    def result(self):
        """The converted text, with redundant attributes removed."""
        sanitized = re.sub(poezio_color_double, r'\1', ''.join(self.builder).strip())
        return re.sub(poezio_format_trim, '\x19o', sanitized)

    def append_formatting(self, formatting):
        """Push a formatting attribute and emit it."""
        self.formatting.append(formatting)
        self.builder.append(formatting)

    def pop_formatting(self):
        """Drop the last attribute: reset, then re-emit the remaining ones."""
        self.formatting.pop()
        self.builder.append('\x19o' + ''.join(self.formatting))

    def characters(self, characters):
        """Emit text, collapsing whitespace unless inside a <pre>."""
        self.builder.append(characters if self.is_pre else trim(characters))

    def _extract_image(self, src):
        """
        Decode a base64 data: URI and store it in tmp_dir under the sha1
        of its content, emitting a placeholder (or an error message).
        """
        type_, data = re.match(xhtml_data_re, src).groups()
        try:
            bin_data = base64.b64decode(unquote(data))
            filename = hashlib.sha1(bin_data).hexdigest() + '.' + type_
            filepath = path.join(self.tmp_dir, filename)
            if not path.exists(filepath):
                with open(filepath, 'wb') as fd:
                    fd.write(bin_data)
        except Exception as e:
            # best effort: invalid base64, missing tmp_dir or an I/O
            # error must not abort the conversion of the whole message
            self.builder.append('[Error while saving image: %s]' % e)
        else:
            self.builder.append('[file stored as %s]' % filename)

    def startElementNS(self, name, _, attrs):
        if name[0] != XHTML_NS and not self.force_ns:
            return

        builder = self.builder
        # keep only the attributes that are not namespaced
        attrs = {attr_name: value
                 for ((ns, attr_name), value) in attrs.items() if ns is None}
        self.attrs.append(attrs)

        if 'style' in attrs:
            style = parse_css(attrs['style'])
            self.append_formatting(style)

        name = name[1]
        if name == 'a':
            self.append_formatting('\x19u')
            self.a_start = len(self.builder)
        elif name == 'blockquote':
            builder.append('“')
        elif name == 'br':
            builder.append('\n')
        elif name == 'cite':
            self.append_formatting('\x19u')
        elif name == 'em':
            self.append_formatting('\x19i')
        elif name == 'img':
            # the input is untrusted: an <img> may lack its src attribute
            src = attrs.get('src', '')
            if self.extract_images and re.match(xhtml_data_re, src):
                self._extract_image(src)
            else:
                builder.append(trim(src))
            if 'alt' in attrs:
                builder.append(' (%s)' % trim(attrs['alt']))
        elif name == 'ul':
            self.list_state.append('ul')
        elif name == 'ol':
            self.list_state.append(1)
        elif name == 'li':
            try:
                state = self.list_state[-1]
            except IndexError:
                # <li> outside of any list: render it as a bullet
                state = 'ul'
            if state == 'ul':
                builder.append('\n• ')
            else:
                builder.append('\n%d) ' % state)
                self.list_state[-1] = state + 1
        elif name == 'p':
            builder.append('\n')
        elif name == 'pre':
            builder.append('\n')
            self.is_pre = True
        elif name == 'strong':
            self.append_formatting('\x19b')

    def endElementNS(self, name, _):
        if name[0] != XHTML_NS and not self.force_ns:
            return

        builder = self.builder
        attrs = self.attrs.pop()
        name = name[1]

        if name == 'a':
            self.pop_formatting()
            # do not display the link twice
            text_elements = [x for x in self.builder[self.a_start:]
                             if not x.startswith('\x19')]
            link_text = ''.join(text_elements).strip()
            if 'href' in attrs and attrs['href'] != link_text:
                builder.append(' (%s)' % trim(attrs['href']))
        elif name == 'blockquote':
            builder.append('”')
        elif name in ('cite', 'em', 'strong'):
            self.pop_formatting()
        elif name in ('ol', 'ul'):
            # leave the list, so that the items of an enclosing list get
            # their own marker/numbering back
            if self.list_state:
                self.list_state.pop()
            builder.append('\n')
        elif name == 'p':
            builder.append('\n')
        elif name == 'pre':
            builder.append('\n')
            self.is_pre = False

        if 'style' in attrs:
            self.pop_formatting()

        if 'title' in attrs:
            builder.append(' [' + attrs['title'] + ']')

420
def xhtml_to_poezio_colors(xml, force=False, tmp_dir=None, extract_images=None):
    """
    Parse an XHTML fragment and return its text with poezio attributes.

    `xml` may be a str, bytes, or any object ET.tostring() can serialize.
    `force` makes the handler process elements whatever their namespace;
    `tmp_dir` and `extract_images` are passed to XHTMLHandler.
    Parsing errors raised by the SAX parser are propagated to the caller.
    """
    if isinstance(xml, str):
        xml = xml.encode('utf8')
    elif not isinstance(xml, bytes):
        xml = ET.tostring(xml)

    handler = XHTMLHandler(force_ns=force, tmp_dir=tmp_dir,
                           extract_images=extract_images)
    parser = sax.make_parser()
    parser.setFeature(sax.handler.feature_namespaces, True)
    # the markup comes from the network: never resolve external entities
    parser.setFeature(sax.handler.feature_external_ges, False)
    parser.setContentHandler(handler)
    parser.parse(BytesIO(xml))
    return handler.result
433

434 435 436 437 438 439 440 441 442
def clean_text(s):
    """
    Return the string without any xhtml-im attribute (\\x19etc)
    written in the complete color format, i.e \\x19xxx}
    """
    return xhtml_attr_re.sub("", s)

def clean_text_simple(string):
    """
    Remove all \\x19 from the string formatted with simple colors:
    \\x198

    Every \\x19 is dropped together with the character following it
    (if any). The string is scanned once from left to right.
    """
    kept = []
    cursor = 0
    pos = string.find('\x19')
    while pos != -1:
        kept.append(string[cursor:pos])
        # skip the marker and the attribute character after it
        cursor = pos + 2
        pos = string.find('\x19', cursor)
    kept.append(string[cursor:])
    return ''.join(kept)

453 454 455 456 457
def convert_simple_to_full_colors(text):
    """
    takes a \\x19n formatted string and returns
    a \\x19n} formatted one.

    The control characters \\x0E to \\x19 are first translated to
    their \\x19-prefixed attribute, then a '}' is appended after
    every \\x19 followed by a digit.
    """
    # TODO, have a single list of this. This is some sort of
    # duplicate from windows.format_chars
    mapping = str.maketrans({'\x0E': '\x19b', '\x0F': '\x19o', '\x10': '\x19u',
                             '\x11': '\x191', '\x12': '\x192', '\x13': '\x193',
                             '\x14': '\x194', '\x15': '\x195', '\x16': '\x196',
                             '\x17': '\x197', '\x18': '\x198', '\x19': '\x199'})
    text = text.translate(mapping)

    def add_curly_bracket(match):
        return match.group(0) + '}'
    return re.sub(xhtml_simple_attr_re, add_curly_bracket, text)

louiz’'s avatar
louiz’ committed
469 470 471
# CSS color names used by poezio_colors_to_html() for the color
# numbers 1 to 7; other numbers go through ncurses_color_to_html().
number_to_color_names = {
    1: 'red',
    2: 'green',
    3: 'yellow',
    4: 'blue',
    5: 'violet',
    6: 'turquoise',
    7: 'white'
}

479 480 481
def format_inline_css(_dict):
    """Serialize a mapping of CSS properties as concatenated 'key: value;' items."""
    declarations = ['%s: %s;' % item for item in _dict.items()]
    return ''.join(declarations)

louiz’'s avatar
louiz’ committed
482 483
def poezio_colors_to_html(string):
    """
    Convert poezio colors to html
    (e.g. \\x191}: <span style='color: red'>)

    The result is wrapped in <body><p>...</p></body>, the text is
    XML-escaped and newlines become <br />. Only the reset (o), bold
    (b), underline (u) and color attributes are translated; any other
    attribute is dropped from the output.
    """
    # Maintain a list of the current css attributes used
    # And check if a tag is open (by design, we only open
    # spans tag, and they cannot be nested.
    current_attrs = {}
    tag_open = False
    build = ["<body xmlns='http://www.w3.org/1999/xhtml'><p>"]

    def check_property(key, value):
        """Record a css property; close the open span if it changed."""
        nonlocal tag_open
        if current_attrs.get(key, None) == value:
            return
        current_attrs[key] = value
        if tag_open:
            tag_open = False
            build.append('</span>')

    next_attr_char = string.find('\x19')
    while next_attr_char != -1:
        if next_attr_char + 1 >= len(string):
            # dangling '\x19' at the very end: drop it, keep the text
            string = string[:next_attr_char]
            break
        attr_char = string[next_attr_char+1].lower()

        # flush the text preceding the attribute
        if next_attr_char > 0:
            if current_attrs and not tag_open:
                build.append('<span style="%s">' % format_inline_css(current_attrs))
                tag_open = True
            build.append(saxutils.escape(string[:next_attr_char]))

        if attr_char == 'o':
            if tag_open:
                build.append('</span>')
                tag_open = False
            current_attrs.clear()
        elif attr_char == 'b':
            check_property('font-weight', 'bold')
        elif attr_char == 'u':
            check_property('text-decoration', 'underline')

        if '0' <= attr_char <= '9':
            # read the whole run of digits following the marker
            end = next_attr_char + 1
            while end < len(string) and '0' <= string[end] <= '9':
                end += 1
            if end < len(string) and string[end] == '}':
                # full format: \x19<number>}
                number = int(string[next_attr_char+1:end])
                consumed = end + 1
            else:
                # no closing bracket: treat it as a one-digit simple
                # color instead of failing on the conversion
                number = int(attr_char)
                consumed = next_attr_char + 2
            if number in number_to_color_names:
                check_property('color', number_to_color_names.get(number, 'black'))
            else:
                check_property('color', ncurses_color_to_html(number))
            string = string[consumed:]
        else:
            string = string[next_attr_char+2:]
        next_attr_char = string.find('\x19')

    if current_attrs and not tag_open and string:
        build.append('<span style="%s">' % format_inline_css(current_attrs))
        tag_open = True
    build.append(saxutils.escape(string))
    if tag_open:
        build.append('</span>')
    build.append("</p></body>")
    text = ''.join(build)
    return text.replace('\n', '<br />')