You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

341 lines
9.0 KiB

#!/usr/bin/python
#
# Urwid unicode character processing tables
# Copyright (C) 2004-2006 Ian Ward
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# Urwid web site: http://excess.org/urwid/
import re
# Matches text made up only of characters from space through tilde
# (printable ASCII).  calc_width() uses it as a fast path to skip the
# per-character width lookup.  Note that "$" also matches just before a
# single trailing newline.
SAFE_ASCII_RE = re.compile("^[ -~]*$")
# Current byte string encoding mode.  None until set_byte_encoding() is
# called, after which it is one of 'utf8', 'narrow' or 'wide'.
_byte_encoding = None
# GENERATED DATA
# generated from
# http://www.unicode.org/Public/4.0-Update/EastAsianWidth-4.0.0.txt
#
# Each entry is (highest ordinal of a run, screen column width of that run).
# Entries are in ascending ordinal order: get_width() walks the list and
# returns the width of the first entry whose ordinal is >= the ordinal being
# looked up, so an entry covers every ordinal above the previous entry's
# ordinal up to and including its own.
#
# To regenerate, run this file as a script with the data file on stdin
# (see process_east_asian_width) and paste the output here.
widths = [
(126, 1),
(159, 0),
(687, 1),
(710, 0),
(711, 1),
(727, 0),
(733, 1),
(879, 0),
(1154, 1),
(1161, 0),
(4347, 1),
(4447, 2),
(7467, 1),
(7521, 0),
(8369, 1),
(8426, 0),
(9000, 1),
(9002, 2),
(11021, 1),
(12350, 2),
(12351, 1),
(12438, 2),
(12442, 0),
(19893, 2),
(19967, 1),
(55203, 2),
(63743, 1),
(64106, 2),
(65039, 1),
(65059, 0),
(65131, 2),
(65279, 1),
(65376, 2),
(65500, 1),
(65510, 2),
(120831, 1),
(262141, 2),
(1114109, 1),
]
# ACCESSOR FUNCTIONS
def get_width( o ):
    """
    Return the screen column width for unicode ordinal o.

    o -- integer code point to look up

    Ordinals 0xe and 0xf always report width 0.  Every other ordinal
    takes the width of the first entry in the widths table whose upper
    bound is not below it; ordinals beyond the end of the table
    default to width 1.
    """
    global widths
    if o in (0xe, 0xf):
        return 0
    for upper_bound, columns in widths:
        if upper_bound >= o:
            return columns
    return 1
def decode_one( text, pos ):
    """
    Return (ordinal at pos, next position) for UTF-8 encoded text.

    text -- UTF-8 encoded byte string
    pos -- offset of the first byte of the sequence to decode

    Sequences of one to four bytes are decoded.  A truncated sequence,
    a bad continuation byte, an overlong encoding or an invalid lead
    byte yields (ord("?"), pos+1), so callers always advance by at
    least one byte.  Surrogates and values above 0x10FFFF are not
    rejected.
    """
    b1 = ord(text[pos])
    if not b1 & 0x80:
        # plain 7-bit ASCII
        return b1, pos+1
    error = ord("?"), pos+1
    # number of bytes available from pos to the end of text
    lt = len(text) - pos
    if lt < 2:
        return error
    if b1 & 0xe0 == 0xc0:
        # two byte sequence: 110xxxxx 10xxxxxx
        b2 = ord(text[pos+1])
        if b2 & 0xc0 != 0x80:
            return error
        o = ((b1&0x1f)<<6)|(b2&0x3f)
        if o < 0x80:
            # overlong encoding
            return error
        return o, pos+2
    if lt < 3:
        return error
    if b1 & 0xf0 == 0xe0:
        # three byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
        b2 = ord(text[pos+1])
        if b2 & 0xc0 != 0x80:
            return error
        b3 = ord(text[pos+2])
        if b3 & 0xc0 != 0x80:
            return error
        o = ((b1&0x0f)<<12)|((b2&0x3f)<<6)|(b3&0x3f)
        if o < 0x800:
            # overlong encoding
            return error
        return o, pos+3
    if lt < 4:
        return error
    if b1 & 0xf8 == 0xf0:
        # four byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        b2 = ord(text[pos+1])
        if b2 & 0xc0 != 0x80:
            return error
        b3 = ord(text[pos+2])
        if b3 & 0xc0 != 0x80:
            return error
        # the fourth byte lives at pos+3 (it was previously re-read
        # from pos+2, corrupting every four byte character)
        b4 = ord(text[pos+3])
        if b4 & 0xc0 != 0x80:
            return error
        o = ((b1&0x07)<<18)|((b2&0x3f)<<12)|((b3&0x3f)<<6)|(b4&0x3f)
        if o < 0x10000:
            # overlong encoding
            return error
        return o, pos+4
    # stray continuation byte or unsupported lead byte
    return error
def decode_one_right( text, pos):
    """
    Return (ordinal at pos, previous position) for UTF-8 encoded text.
    pos is assumed to be on the trailing byte of a utf-8 sequence.

    text -- UTF-8 encoded byte string
    pos -- offset of the last byte of the sequence to decode

    The text is scanned backwards from pos for the first byte that is
    not a continuation byte and the sequence starting there is decoded
    with decode_one.  The second item returned is the offset just
    before that starting byte.  If no starting byte is found within
    four bytes, or before the beginning of text, (ord("?"), pos-1) is
    returned.
    """
    error = ord("?"), pos-1
    p = pos
    # a UTF-8 sequence is at most four bytes long, so look no further
    # back than pos-3
    while p >= 0 and p > pos-4:
        if ord(text[p])&0xc0 != 0x80:
            o, _next_pos = decode_one( text, p )
            return o, p-1
        p -= 1
    return error
def set_byte_encoding(enc):
    """
    Select how byte strings are interpreted by this module.

    enc -- one of 'utf8', 'narrow' or 'wide'
    """
    global _byte_encoding
    supported = ('utf8', 'narrow', 'wide')
    assert enc in supported
    _byte_encoding = enc
def get_byte_encoding():
    """
    Return the byte encoding mode last passed to set_byte_encoding,
    or the module's initial value if it has not been called.
    """
    current = _byte_encoding
    return current
def calc_text_pos( text, start_offs, end_offs, pref_col ):
    """
    Find the position in text closest to screen column pref_col.

    text -- unicode string or byte string to search
    start_offs -- offset into text treated as screen column 0
    end_offs -- end of the range to search
    pref_col -- screen column being looked for

    Returns (position, actual_col).  The position never lands inside a
    character: when pref_col falls part way through one, the position
    and column of the start of that character are returned.
    """
    assert start_offs <= end_offs, repr((start_offs, end_offs))
    is_unicode = type(text) == type(u"")
    is_utf8_bytes = type(text) == type("") and _byte_encoding == "utf8"

    if is_unicode or is_utf8_bytes:
        pos = start_offs
        col = 0
        while pos < end_offs:
            # find the ordinal here and the offset of the next character
            if is_utf8_bytes:
                ordinal, next_pos = decode_one(text, pos)
            else:
                ordinal, next_pos = ord(text[pos]), pos + 1
            char_width = get_width(ordinal)
            if col + char_width > pref_col:
                # this character would pass pref_col: stop before it
                break
            pos = next_pos
            col += char_width
        return pos, col

    assert type(text) == type(""), repr(text)
    # "wide" and "narrow": one byte per screen column
    target = start_offs + pref_col
    if target >= end_offs:
        return end_offs, end_offs - start_offs
    if (_byte_encoding == "wide"
            and within_double_byte(text, start_offs, target) == 2):
        # back off to the first half of the double-byte character
        target -= 1
    return target, target - start_offs
def calc_width( text, start_offs, end_offs ):
    """
    Return the screen column width of text between start_offs and end_offs.

    text -- unicode string or byte string to measure
    start_offs -- offset of the first character to measure
    end_offs -- offset just past the last character to measure
    """
    assert start_offs <= end_offs, repr((start_offs, end_offs))
    utf8_bytes = type(text) == type("") and _byte_encoding == "utf8"
    needs_scan = ((type(text) == type(u"") or utf8_bytes)
        and not SAFE_ASCII_RE.match(text))

    if not needs_scan:
        # "wide" and "narrow", or text that is entirely safe ASCII:
        # every offset takes up exactly one column
        return end_offs - start_offs

    pos = start_offs
    total = 0
    while pos < end_offs:
        if utf8_bytes:
            ordinal, pos = decode_one(text, pos)
        else:
            ordinal = ord(text[pos])
            pos += 1
        total += get_width(ordinal)
    return total
def is_wide_char( text, offs ):
    """
    Test if the character at offs within text is wide.

    text -- unicode string or byte string
    offs -- offset of the character to test

    Returns True when the character takes up two screen columns.
    """
    text_type = type(text)
    if text_type != type(u""):
        assert text_type == type("")
        if _byte_encoding == "wide":
            # wide when offs sits on the first half of a double-byte char
            return within_double_byte(text, offs, offs) == 1
        if _byte_encoding != "utf8":
            return False
        ordinal = decode_one(text, offs)[0]
    else:
        ordinal = ord(text[offs])
    return get_width(ordinal) == 2
def move_prev_char( text, start_offs, end_offs ):
    """
    Return the position of the character before end_offs.

    text -- unicode string or byte string
    start_offs -- lower bound of the range being moved within
    end_offs -- offset to move back from (must be > start_offs)

    For utf8 byte strings the result is never below start_offs, even
    when the bytes before end_offs are all continuation bytes.
    """
    assert start_offs < end_offs
    if type(text) == type(u""):
        return end_offs-1
    assert type(text) == type("")
    if _byte_encoding == "utf8":
        o = end_offs-1
        # skip back over continuation bytes, stopping at start_offs so
        # malformed text cannot push the scan out of the range (this
        # mirrors the end_offs bound in move_next_char)
        while o > start_offs and ord(text[o])&0xc0 == 0x80:
            o -= 1
        return o
    if _byte_encoding == "wide" and within_double_byte( text,
        start_offs, end_offs-1) == 2:
        # end_offs-1 is the second half of a double-byte character
        return end_offs-2
    return end_offs-1
def move_next_char( text, start_offs, end_offs ):
    """
    Return the position of the character after start_offs.

    text -- unicode string or byte string
    start_offs -- offset to move forward from (must be < end_offs)
    end_offs -- upper bound of the range being moved within
    """
    assert start_offs < end_offs
    following = start_offs + 1
    if type(text) == type(u""):
        return following
    assert type(text) == type("")
    if _byte_encoding == "utf8":
        # step over continuation bytes, but never beyond end_offs
        while (following < end_offs
                and ord(text[following]) & 0xc0 == 0x80):
            following += 1
        return following
    if (_byte_encoding == "wide"
            and within_double_byte(text, start_offs, start_offs) == 1):
        # start_offs is the first half of a double-byte character
        return start_offs + 2
    return following
def within_double_byte(str, line_start, pos):
    """Return whether pos is within a double-byte encoded character.

    str -- string in question
    line_start -- offset of beginning of line (< pos)
    pos -- offset in question

    Return values:
    0 -- not within dbe char, or double_byte_encoding == False
    1 -- pos is on the 1st half of a dbe char
    2 -- pos is on the 2nd half of a dbe char
    """
    code = ord(str[pos])

    if 0x40 <= code < 0x7f:
        # might be second half of big5, uhc or gbk encoding: that is
        # only the case when the byte before it is a first half
        follows_first_half = (
            pos != line_start
            and ord(str[pos - 1]) >= 0x81
            and within_double_byte(str, line_start, pos - 1) == 1)
        if follows_first_half:
            return 2
        return 0

    if code < 0x80:
        return 0

    # walk back over the unbroken run of high bytes ending at pos; an
    # odd run length puts pos on a first half, an even one on a second
    scan = pos - 1
    while scan >= line_start and ord(str[scan]) >= 0x80:
        scan -= 1
    if (pos - scan) % 2:
        return 1
    return 2
# TABLE GENERATION CODE
def process_east_asian_width():
    """
    Build a new widths table from East Asian Width data.

    Reads the data file from stdin and prints Python source for a
    "widths" list to stdout.  Each data line is expected to look like
    "XXXX;W # NAME ..." or "XXXX..YYYY;W # NAME ...".  Lines starting
    with "#" and blank lines are skipped.

    A code point is given width 0 when the first word after " # " is
    COMBINING, MODIFIER or <control>, width 2 when its width field is
    W or F, and width 1 otherwise.  Consecutive code points of equal
    width are merged into one (last ordinal, width) entry.
    """
    import sys
    out = []
    last = None
    for line in sys.stdin.readlines():
        if line[:1] == "#":
            continue
        line = line.strip()
        if not line:
            # blank lines carry no data and cannot be split below
            continue
        code_field, rest = line.split(";", 1)
        wid, rest = rest.split(" # ", 1)
        word1 = rest.split(" ", 1)[0]
        if "." in code_field:
            # a range "XXXX..YYYY": only its upper bound is recorded
            code_field = code_field.split("..")[1]
        num = int(code_field, 16)

        if word1 in ("COMBINING", "MODIFIER", "<control>"):
            width = 0
        elif wid in ("W", "F"):
            width = 2
        else:
            width = 1

        if last is None:
            # seed the table so the merge step has an entry to extend
            out.append((0, width))
            last = width
        if last == width:
            # same width as the current run: extend its upper bound
            out[-1] = (num, width)
        else:
            out.append((num, width))
            last = width

    write = sys.stdout.write
    write("widths = [\n")
    for entry in out[1:]: # treat control characters same as ascii
        write("\t" + repr(entry) + ",\n")
    write("]\n")
# When run as a script, read East Asian Width data from stdin and print a
# replacement "widths" table to stdout.
if __name__ == "__main__":
    process_east_asian_width()