Files#
Table of Contents#
Programming Environment#
from html.entities import codepoint2name
import string
from typing import List, Union
import unicodedata
from datetime import datetime
import locale as l
import platform as p
import sys
# Timestamp of this run, followed by one blank separator line.
print(datetime.now(), end="\n\n")

# Platform details and interpreter details, one labelled line each.
# Labels are left-aligned in a 20-character column; continuation lines
# use an empty label so the values stay aligned.
print(
    f"{'Platform':<20}: "
    + " | ".join((p.mac_ver()[0], p.system(), p.release(), p.machine())),
    f"{'':<20}: {l.getpreferredencoding()}",
    "",
    f"{'Python':<20}: {sys.version}",
    f"{'':<20}: {sys.version_info}",
    f"{'':<20}: {p.python_implementation()}",
    sep="\n",
)
2024-05-21 15:42:47.215239
Platform : 14.4.1 | Darwin | 23.4.0 | arm64
: UTF-8
Python : 3.11.9 | packaged by conda-forge | (main, Apr 19 2024, 18:34:54) [Clang 16.0.6 ]
: sys.version_info(major=3, minor=11, micro=9, releaselevel='final', serial=0)
: CPython
Auxiliary#
Show code cell source
def dec_to_hex(dec: int = 2**16 - 1) -> str:
    """ Compose the hexadecimal representation
          as a string
          of a nonnegative integer.

    The digits are uppercase and the result is zero-padded on the left to
    at least six characters; wider values are returned in full.

    params: int (dec)
    return: str
    raises: AssertionError when `dec` is negative
    """
    # Raise explicitly instead of using an `assert` statement: asserts are
    # removed under `python -O`, which would let negative input through.
    # AssertionError is kept so existing `except AssertionError` handlers work.
    if dec < 0:
        raise AssertionError('Try again with a nonnegative integer.')
    return format(dec, '06X')
# Exercise dec_to_hex with a rejected value (-1), the smallest values,
# and values that fit in / overflow the six-digit padding.
test_cases = [-1, 0, 1, 2**16 - 1, 2**32 - 1]

for test_case in test_cases:
    try:
        outcome = repr(dec_to_hex(test_case))
    except AssertionError as e:
        # Show the validation message in place of a result.
        outcome = str(e)
    print(f"Case {test_case:<10}: {outcome}")
Show code cell output
Case -1 : Try again with a nonnegative integer.
Case 0 : '000000'
Case 1 : '000001'
Case 65535 : '00FFFF'
Case 4294967295: 'FFFFFFFF'
Show code cell source
def to_codepoint(hexa: str = '10FFFF') -> str:
    """ Compose a Unicode code point
          as a string.

    The result is the raw (undecoded) eight-digit escape: a backslash,
    a capital U, and the code point as eight uppercase hexadecimal digits.

    params: str (hexa) -- hexadecimal digits of the code point, any length
    return: str
    raises: AssertionError when the value is outside 0..0x10FFFF
            ValueError     when `hexa` is not valid hexadecimal
    """
    value = int(hexa, base=16)
    # Raised explicitly (not via `assert`) so the check survives `python -O`;
    # the lower bound rejects inputs such as '-1'.
    if not 0 <= value <= 0x10FFFF:
        raise AssertionError('Try again with a valid code point.')
    # Format from the parsed integer rather than splicing the input text, so
    # inputs that are not exactly six digits still give a well-formed escape.
    return f'\\U{value:08X}'
# Exercise to_codepoint with the six-digit hex strings for 0 through 4.
test_cases = list(map(dec_to_hex, range(5)))

for test_case in test_cases:
    try:
        outcome = repr(to_codepoint(test_case))
    except AssertionError as e:
        # Show the validation message in place of a result.
        outcome = str(e)
    print(f"Case {test_case:<10}: {outcome}")
Show code cell output
Case 000000 : '\\U00000000'
Case 000001 : '\\U00000001'
Case 000002 : '\\U00000002'
Case 000003 : '\\U00000003'
Case 000004 : '\\U00000004'
Show code cell source
# dec_to_glyph maps an integer code point to its one-character string.
# It is the builtin `chr` under a name that matches the other helpers, so
# the argument must be a nonnegative integer no greater than
# 0x10FFFF (1_114_111).
dec_to_glyph = chr
print(f"{dec_to_glyph(0xFF)!r}")
Show code cell output
'ÿ'
Show code cell source
def to_glyph(code_point: str = '\\u00FF') -> str:
    """ Convert a raw Unicode code point to its non raw (graphical) form.

    Backslash escapes in the text are decoded; characters that are already
    literal (not escaped) are returned unchanged.

    params: str (code_point)
    return: str
    """
    # The unicode-escape codec reads its input bytes as Latin-1, so encode to
    # Latin-1 rather than UTF-8; otherwise a literal non-ASCII character comes
    # back as mojibake. Characters outside Latin-1 are written as backslash
    # escapes, which the decode step turns back into the same characters.
    raw = code_point.encode('latin-1', 'backslashreplace')
    return raw.decode('unicode-escape')


print(to_glyph())
Show code cell output
ÿ
Show code cell source
def print_code_point_information(points: str = 'hello world') -> None:
    """ Print information about the Unicode code points of a string.

    Prints the string itself, then the number of code points in it, then a
    table with one row per code point showing:
      * glyph
      * raw code point
      * hexadecimal repr
      * byte repr (UTF-8)
      * Unicode category
      * Named entity repr (or 'NO NAMED ENTITY')
      * Unicode name      (or 'NO UNICODE NAME')

    params: str (points)
    return: None
    """
    print(points)
    print(len(points))
    print()
    print(f"{'Glyph':<10} "
          f"{'Code Point':<10} "
          f"{'Hex':<10} "
          f"{'Bytes':<20} "
          f"{'Category':<10} "
          f"{'Named Entity':<20} "
          f"{'Name':<10}")
    for point in points:
        # Guard each row separately so that one rejected value is reported
        # without dropping the rows that follow it.
        try:
            hex_rep = dec_to_hex(ord(point))
            code_point = to_codepoint(hex_rep)
        except AssertionError as e:
            print(f"Case {point}: {e}")
            continue
        # Both lookups accept a fallback, so no exception handling is needed.
        entity = codepoint2name.get(ord(point), 'NO NAMED ENTITY')
        name = unicodedata.name(point, 'NO UNICODE NAME')
        print(f"{point:<10} "
              f"{code_point:<10} "
              f"{hex_rep:<10} "
              f"{str(point.encode('utf-8')):<20} "
              f"{unicodedata.category(point):<10} "
              f"{entity:<20} "
              f"{name}")


print_code_point_information()
Show code cell output
hello world
11
Glyph Code Point Hex Bytes Category Named Entity Name
h \U00000068 000068 b'h' Ll NO NAMED ENTITY LATIN SMALL LETTER H
e \U00000065 000065 b'e' Ll NO NAMED ENTITY LATIN SMALL LETTER E
l \U0000006C 00006C b'l' Ll NO NAMED ENTITY LATIN SMALL LETTER L
l \U0000006C 00006C b'l' Ll NO NAMED ENTITY LATIN SMALL LETTER L
o \U0000006F 00006F b'o' Ll NO NAMED ENTITY LATIN SMALL LETTER O
\U00000020 000020 b' ' Zs NO NAMED ENTITY SPACE
w \U00000077 000077 b'w' Ll NO NAMED ENTITY LATIN SMALL LETTER W
o \U0000006F 00006F b'o' Ll NO NAMED ENTITY LATIN SMALL LETTER O
r \U00000072 000072 b'r' Ll NO NAMED ENTITY LATIN SMALL LETTER R
l \U0000006C 00006C b'l' Ll NO NAMED ENTITY LATIN SMALL LETTER L
d \U00000064 000064 b'd' Ll NO NAMED ENTITY LATIN SMALL LETTER D
Text Encoding#
A text encoding is a codec for serializing text: it encodes text to bytes and decodes bytes back to text.
Encoding is the serialization of a string into a sequence of bytes and decoding is the deserialization of a sequence of bytes into a string.
ASCII#
[ w ] Basic Latin
The 128 code points 0-127 are mapped to the bytes 0x00-0x7F, where the first (most significant) bit is a placeholder that is always 0 and the remaining 7 bits encode the code point.
Code Points |
Encoding |
---|---|
|
|
33 control codes
95 printable characters
26 uppercase letters
26 lowercase letters
10 digits
32 punctuation
1 whitespace
26 * 2 + 10 + 32 + 1 + 33
128
Control Codes#
Control Characters
[ u ] Control Pictures
[ w ] NUL
[ w ] ETX
[ w ] EOT
[ w ] ENQ
[ w ] ACK
[ w ] BEL
[ w ] BS
[ w ] HT
[ w ] LF
[ w ] FF
[ w ] CR
[ w ] SO
[ w ] SI
[ w ] NAK
[ w ] SYN
[ w ] ETB
[ w ] CAN
[ w ] SUB
[ w ] ESC
[ w ] DEL
ASCII |
Abbreviation |
Caret Notation |
Signal |
Escape |
HTML Entity |
Percent Code |
Unicode |
Unicode Name |
---|---|---|---|---|---|---|---|---|
0 |
NUL |
^@ |
\0 |
|
%00 |
U+0000 |
NULL |
|
1 |
SOH |
^A |
|
U+0001 |
START OF HEADING |
|||
2 |
STX |
^B |
|
U+0002 |
START OF TEXT |
|||
3 |
ETX |
^C |
Ctrl-C |
|
U+0003 |
END OF TEXT |
||
4 |
EOT |
^D |
Ctrl-D |
|
U+0004 |
END OF TRANSMISSION |
||
5 |
ENQ |
^E |
|
U+0005 |
ENQUIRY |
|||
6 |
ACK |
^F |
|
U+0006 |
ACKNOWLEDGE |
|||
7 |
BEL |
^G |
\a |
|
U+0007 |
BELL |
||
8 |
BS |
^H |
\b |
|
U+0008 |
BACKSPACE |
||
9 |
HT |
^I |
\t |
|
U+0009 |
CHARACTER TABULATION (horizontal tabulation, tab) |
||
10 |
LF |
^J |
\n |
|
U+000A |
LINE FEED (new line NL, end of line EOL) |
||
11 |
VT |
^K |
\v |
|
U+000B |
LINE TABULATION (vertical tabulation) |
||
12 |
FF |
^L |
Ctrl-L |
\f |
|
U+000C |
FORM FEED |
|
13 |
CR |
^M |
\r |
|
U+000D |
CARRIAGE RETURN |
||
14 |
SO |
^N |
|
U+000E |
SHIFT OUT (locking-shift one) |
|||
15 |
SI |
^O |
|
U+000F |
SHIFT IN (locking-shift zero) |
|||
16 |
DLE |
^P |
|
U+0010 |
DATA LINK ESCAPE |
|||
17 |
DC1 |
^Q |
|
U+0011 |
DEVICE CONTROL ONE |
|||
18 |
DC2 |
^R |
|
U+0012 |
DEVICE CONTROL TWO |
|||
19 |
DC3 |
^S |
|
U+0013 |
DEVICE CONTROL THREE |
|||
20 |
DC4 |
^T |
|
U+0014 |
DEVICE CONTROL FOUR |
|||
21 |
NAK |
^U |
|
U+0015 |
NEGATIVE ACKNOWLEDGE |
|||
22 |
SYN |
^V |
|
U+0016 |
SYNCHRONOUS IDLE |
|||
23 |
ETB |
^W |
|
U+0017 |
END OF TRANSMISSION BLOCK |
|||
24 |
CAN |
^X |
|
U+0018 |
CANCEL |
|||
25 |
EM |
^Y |
|
U+0019 |
END OF MEDIUM |
|||
26 |
SUB |
^Z |
Ctrl-Z |
|
U+001A |
SUBSTITUTE |
||
27 |
ESC |
^[ |
\e |
|
U+001B |
ESCAPE |
||
28 |
FS |
^\ |
|
U+001C |
INFORMATION SEPARATOR FOUR (file separator) |
|||
29 |
GS |
^] |
|
U+001D |
INFORMATION SEPARATOR THREE (group separator) |
|||
30 |
RS |
^^ |
|
U+001E |
INFORMATION SEPARATOR TWO (record separator) |
|||
31 |
US |
^_ |
|
U+001F |
INFORMATION SEPARATOR ONE (unit separator) |
|||
127 |
DEL |
^? |
|
U+007F |
DELETE |
ASCII |
Control Picture |
HTML Entity |
Unicode |
Unicode Name |
---|---|---|---|---|
0 |
␀ |
|
U+2400 |
SYMBOL FOR NULL |
1 |
␁ |
|
U+2401 |
SYMBOL FOR START OF HEADING |
2 |
␂ |
|
U+2402 |
SYMBOL FOR START OF TEXT |
3 |
␃ |
|
U+2403 |
SYMBOL FOR END OF TEXT |
4 |
␄ |
|
U+2404 |
SYMBOL FOR END OF TRANSMISSION |
5 |
␅ |
|
U+2405 |
SYMBOL FOR ENQUIRY |
6 |
␆ |
|
U+2406 |
SYMBOL FOR ACKNOWLEDGE |
7 |
␇ |
|
U+2407 |
SYMBOL FOR BELL |
8 |
␈ |
|
U+2408 |
SYMBOL FOR BACKSPACE |
9 |
␉ |
|
U+2409 |
SYMBOL FOR HORIZONTAL TABULATION |
10 |
␊ |
|
U+240A |
SYMBOL FOR LINE FEED |
11 |
␋ |
|
U+240B |
SYMBOL FOR VERTICAL TABULATION |
12 |
␌ |
|
U+240C |
SYMBOL FOR FORM FEED |
13 |
␍ |
|
U+240D |
SYMBOL FOR CARRIAGE RETURN |
14 |
␎ |
|
U+240E |
SYMBOL FOR SHIFT OUT |
15 |
␏ |
|
U+240F |
SYMBOL FOR SHIFT IN |
16 |
␐ |
|
U+2410 |
SYMBOL FOR DATA LINK ESCAPE |
17 |
␑ |
|
U+2411 |
SYMBOL FOR DEVICE CONTROL ONE |
18 |
␒ |
|
U+2412 |
SYMBOL FOR DEVICE CONTROL TWO |
19 |
␓ |
|
U+2413 |
SYMBOL FOR DEVICE CONTROL THREE |
20 |
␔ |
|
U+2414 |
SYMBOL FOR DEVICE CONTROL FOUR |
21 |
␕ |
|
U+2415 |
SYMBOL FOR NEGATIVE ACKNOWLEDGE |
22 |
␖ |
|
U+2416 |
SYMBOL FOR SYNCHRONOUS IDLE |
23 |
␗ |
|
U+2417 |
SYMBOL FOR END OF TRANSMISSION BLOCK |
24 |
␘ |
|
U+2418 |
SYMBOL FOR CANCEL |
25 |
␙ |
|
U+2419 |
SYMBOL FOR END OF MEDIUM |
26 |
␚ |
|
U+241A |
SYMBOL FOR SUBSTITUTE |
27 |
␛ |
|
U+241B |
SYMBOL FOR ESCAPE |
28 |
␜ |
|
U+241C |
SYMBOL FOR FILE SEPARATOR |
29 |
␝ |
|
U+241D |
SYMBOL FOR GROUP SEPARATOR |
30 |
␞ |
|
U+241E |
SYMBOL FOR RECORD SEPARATOR |
31 |
␟ |
|
U+241F |
SYMBOL FOR UNIT SEPARATOR |
32 |
␠ |
|
U+2420 |
SYMBOL FOR SPACE |
127 |
␡ |
|
U+2421 |
SYMBOL FOR DELETE |
Uppercase Letters#
ASCII |
Symbol |
HTML Entity |
Unicode Code Point |
Unicode Name |
---|---|---|---|---|
65 |
A |
|
U+0041 |
LATIN CAPITAL LETTER A |
66 |
B |
|
U+0042 |
LATIN CAPITAL LETTER B |
67 |
C |
|
U+0043 |
LATIN CAPITAL LETTER C |
68 |
D |
|
U+0044 |
LATIN CAPITAL LETTER D |
69 |
E |
|
U+0045 |
LATIN CAPITAL LETTER E |
70 |
F |
|
U+0046 |
LATIN CAPITAL LETTER F |
71 |
G |
|
U+0047 |
LATIN CAPITAL LETTER G |
72 |
H |
|
U+0048 |
LATIN CAPITAL LETTER H |
73 |
I |
|
U+0049 |
LATIN CAPITAL LETTER I |
74 |
J |
|
U+004A |
LATIN CAPITAL LETTER J |
75 |
K |
|
U+004B |
LATIN CAPITAL LETTER K |
76 |
L |
|
U+004C |
LATIN CAPITAL LETTER L |
77 |
M |
|
U+004D |
LATIN CAPITAL LETTER M |
78 |
N |
|
U+004E |
LATIN CAPITAL LETTER N |
79 |
O |
|
U+004F |
LATIN CAPITAL LETTER O |
80 |
P |
|
U+0050 |
LATIN CAPITAL LETTER P |
81 |
Q |
|
U+0051 |
LATIN CAPITAL LETTER Q |
82 |
R |
|
U+0052 |
LATIN CAPITAL LETTER R |
83 |
S |
|
U+0053 |
LATIN CAPITAL LETTER S |
84 |
T |
|
U+0054 |
LATIN CAPITAL LETTER T |
85 |
U |
|
U+0055 |
LATIN CAPITAL LETTER U |
86 |
V |
|
U+0056 |
LATIN CAPITAL LETTER V |
87 |
W |
|
U+0057 |
LATIN CAPITAL LETTER W |
88 |
X |
|
U+0058 |
LATIN CAPITAL LETTER X |
89 |
Y |
|
U+0059 |
LATIN CAPITAL LETTER Y |
90 |
Z |
|
U+005A |
LATIN CAPITAL LETTER Z |
print_code_point_information(string.ascii_uppercase)
ABCDEFGHIJKLMNOPQRSTUVWXYZ
26
Glyph Code Point Hex Bytes Category Named Entity Name
A \U00000041 000041 b'A' Lu NO NAMED ENTITY LATIN CAPITAL LETTER A
B \U00000042 000042 b'B' Lu NO NAMED ENTITY LATIN CAPITAL LETTER B
C \U00000043 000043 b'C' Lu NO NAMED ENTITY LATIN CAPITAL LETTER C
D \U00000044 000044 b'D' Lu NO NAMED ENTITY LATIN CAPITAL LETTER D
E \U00000045 000045 b'E' Lu NO NAMED ENTITY LATIN CAPITAL LETTER E
F \U00000046 000046 b'F' Lu NO NAMED ENTITY LATIN CAPITAL LETTER F
G \U00000047 000047 b'G' Lu NO NAMED ENTITY LATIN CAPITAL LETTER G
H \U00000048 000048 b'H' Lu NO NAMED ENTITY LATIN CAPITAL LETTER H
I \U00000049 000049 b'I' Lu NO NAMED ENTITY LATIN CAPITAL LETTER I
J \U0000004A 00004A b'J' Lu NO NAMED ENTITY LATIN CAPITAL LETTER J
K \U0000004B 00004B b'K' Lu NO NAMED ENTITY LATIN CAPITAL LETTER K
L \U0000004C 00004C b'L' Lu NO NAMED ENTITY LATIN CAPITAL LETTER L
M \U0000004D 00004D b'M' Lu NO NAMED ENTITY LATIN CAPITAL LETTER M
N \U0000004E 00004E b'N' Lu NO NAMED ENTITY LATIN CAPITAL LETTER N
O \U0000004F 00004F b'O' Lu NO NAMED ENTITY LATIN CAPITAL LETTER O
P \U00000050 000050 b'P' Lu NO NAMED ENTITY LATIN CAPITAL LETTER P
Q \U00000051 000051 b'Q' Lu NO NAMED ENTITY LATIN CAPITAL LETTER Q
R \U00000052 000052 b'R' Lu NO NAMED ENTITY LATIN CAPITAL LETTER R
S \U00000053 000053 b'S' Lu NO NAMED ENTITY LATIN CAPITAL LETTER S
T \U00000054 000054 b'T' Lu NO NAMED ENTITY LATIN CAPITAL LETTER T
U \U00000055 000055 b'U' Lu NO NAMED ENTITY LATIN CAPITAL LETTER U
V \U00000056 000056 b'V' Lu NO NAMED ENTITY LATIN CAPITAL LETTER V
W \U00000057 000057 b'W' Lu NO NAMED ENTITY LATIN CAPITAL LETTER W
X \U00000058 000058 b'X' Lu NO NAMED ENTITY LATIN CAPITAL LETTER X
Y \U00000059 000059 b'Y' Lu NO NAMED ENTITY LATIN CAPITAL LETTER Y
Z \U0000005A 00005A b'Z' Lu NO NAMED ENTITY LATIN CAPITAL LETTER Z
Lowercase Letters#
ASCII |
Symbol |
HTML Entity |
Unicode Code Point |
Unicode Name |
---|---|---|---|---|
97 |
a |
|
U+0061 |
LATIN SMALL LETTER A |
98 |
b |
|
U+0062 |
LATIN SMALL LETTER B |
99 |
c |
|
U+0063 |
LATIN SMALL LETTER C |
100 |
d |
|
U+0064 |
LATIN SMALL LETTER D |
101 |
e |
|
U+0065 |
LATIN SMALL LETTER E |
102 |
f |
|
U+0066 |
LATIN SMALL LETTER F |
103 |
g |
|
U+0067 |
LATIN SMALL LETTER G |
104 |
h |
|
U+0068 |
LATIN SMALL LETTER H |
105 |
i |
|
U+0069 |
LATIN SMALL LETTER I |
106 |
j |
|
U+006A |
LATIN SMALL LETTER J |
107 |
k |
|
U+006B |
LATIN SMALL LETTER K |
108 |
l |
|
U+006C |
LATIN SMALL LETTER L |
109 |
m |
|
U+006D |
LATIN SMALL LETTER M |
110 |
n |
|
U+006E |
LATIN SMALL LETTER N |
111 |
o |
|
U+006F |
LATIN SMALL LETTER O |
112 |
p |
|
U+0070 |
LATIN SMALL LETTER P |
113 |
q |
|
U+0071 |
LATIN SMALL LETTER Q |
114 |
r |
|
U+0072 |
LATIN SMALL LETTER R |
115 |
s |
|
U+0073 |
LATIN SMALL LETTER S |
116 |
t |
|
U+0074 |
LATIN SMALL LETTER T |
117 |
u |
|
U+0075 |
LATIN SMALL LETTER U |
118 |
v |
|
U+0076 |
LATIN SMALL LETTER V |
119 |
w |
|
U+0077 |
LATIN SMALL LETTER W |
120 |
x |
|
U+0078 |
LATIN SMALL LETTER X |
121 |
y |
|
U+0079 |
LATIN SMALL LETTER Y |
122 |
z |
|
U+007A |
LATIN SMALL LETTER Z |
print_code_point_information(string.ascii_lowercase)
abcdefghijklmnopqrstuvwxyz
26
Glyph Code Point Hex Bytes Category Named Entity Name
a \U00000061 000061 b'a' Ll NO NAMED ENTITY LATIN SMALL LETTER A
b \U00000062 000062 b'b' Ll NO NAMED ENTITY LATIN SMALL LETTER B
c \U00000063 000063 b'c' Ll NO NAMED ENTITY LATIN SMALL LETTER C
d \U00000064 000064 b'd' Ll NO NAMED ENTITY LATIN SMALL LETTER D
e \U00000065 000065 b'e' Ll NO NAMED ENTITY LATIN SMALL LETTER E
f \U00000066 000066 b'f' Ll NO NAMED ENTITY LATIN SMALL LETTER F
g \U00000067 000067 b'g' Ll NO NAMED ENTITY LATIN SMALL LETTER G
h \U00000068 000068 b'h' Ll NO NAMED ENTITY LATIN SMALL LETTER H
i \U00000069 000069 b'i' Ll NO NAMED ENTITY LATIN SMALL LETTER I
j \U0000006A 00006A b'j' Ll NO NAMED ENTITY LATIN SMALL LETTER J
k \U0000006B 00006B b'k' Ll NO NAMED ENTITY LATIN SMALL LETTER K
l \U0000006C 00006C b'l' Ll NO NAMED ENTITY LATIN SMALL LETTER L
m \U0000006D 00006D b'm' Ll NO NAMED ENTITY LATIN SMALL LETTER M
n \U0000006E 00006E b'n' Ll NO NAMED ENTITY LATIN SMALL LETTER N
o \U0000006F 00006F b'o' Ll NO NAMED ENTITY LATIN SMALL LETTER O
p \U00000070 000070 b'p' Ll NO NAMED ENTITY LATIN SMALL LETTER P
q \U00000071 000071 b'q' Ll NO NAMED ENTITY LATIN SMALL LETTER Q
r \U00000072 000072 b'r' Ll NO NAMED ENTITY LATIN SMALL LETTER R
s \U00000073 000073 b's' Ll NO NAMED ENTITY LATIN SMALL LETTER S
t \U00000074 000074 b't' Ll NO NAMED ENTITY LATIN SMALL LETTER T
u \U00000075 000075 b'u' Ll NO NAMED ENTITY LATIN SMALL LETTER U
v \U00000076 000076 b'v' Ll NO NAMED ENTITY LATIN SMALL LETTER V
w \U00000077 000077 b'w' Ll NO NAMED ENTITY LATIN SMALL LETTER W
x \U00000078 000078 b'x' Ll NO NAMED ENTITY LATIN SMALL LETTER X
y \U00000079 000079 b'y' Ll NO NAMED ENTITY LATIN SMALL LETTER Y
z \U0000007A 00007A b'z' Ll NO NAMED ENTITY LATIN SMALL LETTER Z
# print_code_point_information(string.ascii_letters)
Digits#
ASCII |
Symbol |
HTML Entity |
Unicode Code Point |
Unicode Name |
---|---|---|---|---|
48 |
0 |
|
U+0030 |
DIGIT ZERO |
49 |
1 |
|
U+0031 |
DIGIT ONE |
50 |
2 |
|
U+0032 |
DIGIT TWO |
51 |
3 |
|
U+0033 |
DIGIT THREE |
52 |
4 |
|
U+0034 |
DIGIT FOUR |
53 |
5 |
|
U+0035 |
DIGIT FIVE |
54 |
6 |
|
U+0036 |
DIGIT SIX |
55 |
7 |
|
U+0037 |
DIGIT SEVEN |
56 |
8 |
|
U+0038 |
DIGIT EIGHT |
57 |
9 |
|
U+0039 |
DIGIT NINE |
print_code_point_information(string.digits)
0123456789
10
Glyph Code Point Hex Bytes Category Named Entity Name
0 \U00000030 000030 b'0' Nd NO NAMED ENTITY DIGIT ZERO
1 \U00000031 000031 b'1' Nd NO NAMED ENTITY DIGIT ONE
2 \U00000032 000032 b'2' Nd NO NAMED ENTITY DIGIT TWO
3 \U00000033 000033 b'3' Nd NO NAMED ENTITY DIGIT THREE
4 \U00000034 000034 b'4' Nd NO NAMED ENTITY DIGIT FOUR
5 \U00000035 000035 b'5' Nd NO NAMED ENTITY DIGIT FIVE
6 \U00000036 000036 b'6' Nd NO NAMED ENTITY DIGIT SIX
7 \U00000037 000037 b'7' Nd NO NAMED ENTITY DIGIT SEVEN
8 \U00000038 000038 b'8' Nd NO NAMED ENTITY DIGIT EIGHT
9 \U00000039 000039 b'9' Nd NO NAMED ENTITY DIGIT NINE
Punctuation#
ASCII |
Symbol |
HTML Entity |
Unicode Code Point |
Unicode Name |
---|---|---|---|---|
32 |
|
|
U+0020 |
SPACE [ w ] |
33 |
! |
|
U+0021 |
EXCLAMATION MARK (factorial, bang) [ w ] |
34 |
" |
|
U+0022 |
QUOTATION MARK (double quote) [ w ] |
35 |
# |
|
U+0023 |
NUMBER SIGN (pound sign, hash) [ w ] |
36 |
$ |
|
U+0024 |
DOLLAR SIGN [ w ] |
37 |
% |
|
U+0025 |
PERCENT SIGN [ w ] |
38 |
& |
|
U+0026 |
AMPERSAND (and) [ w ] |
39 |
' |
|
U+0027 |
APOSTROPHE (single quote) |
40 |
( |
|
U+0028 |
LEFT PARENTHESIS (opening parenthesis) [ w ] |
41 |
) |
|
U+0029 |
RIGHT PARENTHESIS (closing parenthesis) [ w ] |
42 |
* |
|
U+002A |
ASTERISK (star) [ w ] (ἀστερίσκος “little star”) |
43 |
+ |
|
U+002B |
PLUS SIGN [ w ] |
44 |
, |
|
U+002C |
COMMA [ w ] |
45 |
- |
|
U+002D |
HYPHEN-MINUS [ w ] (hyphen [ w ], dash [ w ], minus sign [ w ]) |
46 |
. |
|
U+002E |
FULL STOP (period, dot, decimal point) [ w ] |
47 |
/ |
|
U+002F |
SOLIDUS (slash, forward slash) [ w ] |
58 |
: |
|
U+003A |
COLON [ w ] |
59 |
; |
|
U+003B |
SEMICOLON [ w ] |
60 |
< |
|
U+003C |
LESS-THAN SIGN [ w ] |
61 |
= |
|
U+003D |
EQUALS SIGN [ w ] |
62 |
> |
|
U+003E |
GREATER-THAN SIGN [ w ] |
63 |
? |
|
U+003F |
QUESTION MARK [ w ] |
64 |
@ |
|
U+0040 |
COMMERCIAL AT (at sign) [ w ] |
91 |
[ |
|
U+005B |
LEFT SQUARE BRACKET (opening square bracket) [ w ] |
92 |
\ |
|
U+005C |
REVERSE SOLIDUS (backslash) [ w ] |
93 |
] |
|
U+005D |
RIGHT SQUARE BRACKET (closing square bracket) [ w ] |
94 |
^ |
|
U+005E |
CIRCUMFLEX ACCENT (“caret”, “hat”) [ w ] |
95 |
_ |
|
U+005F |
LOW LINE (“underscore”) [ w ] |
96 |
` |
|
U+0060 |
GRAVE ACCENT (backtick, backquote) [ w ] |
123 |
{ |
|
U+007B |
LEFT CURLY BRACKET (opening curly bracket, left brace) [ w ] |
124 |
| |
|
U+007C |
VERTICAL LINE (vertical bar, pipe) [ w ] |
125 |
} |
|
U+007D |
RIGHT CURLY BRACKET (closing curly bracket, right brace) [ w ] |
126 |
~ |
|
U+007E |
TILDE [ w ] |
Dashes
Symbol |
HTML Entity |
Unicode Code Point |
Unicode Name |
---|---|---|---|
- |
|
U+002D |
HYPHEN-MINUS (hyphen, dash, minus sign) |
‐ |
|
U+2010 |
HYPHEN |
‒ |
|
U+2012 |
FIGURE DASH |
– |
|
U+2013 |
EN DASH |
— |
|
U+2014 |
EM DASH |
― |
|
U+2015 |
HORIZONTAL BAR |
− |
|
U+2212 |
MINUS SIGN |
Symbol |
HTML Entity |
Unicode Code Point |
Unicode Name |
---|---|---|---|
|
|
U+00AD |
SOFT HYPHEN |
˗ |
|
U+02D7 |
MODIFIER LETTER MINUS SIGN |
‑ |
|
U+2011 |
NON-BREAKING HYPHEN |
‧ |
|
U+2027 |
HYPHENATION POINT |
⁃ |
|
U+2043 |
HYPHEN BULLET |
𐆑 |
|
U+10191 |
ROMAN UNCIA SIGN |
Quotation Marks
Symbol |
HTML Entity |
Unicode Code Point |
Unicode Name |
---|---|---|---|
‘ |
|
U+2018 |
LEFT SINGLE QUOTATION MARK |
’ |
|
U+2019 |
RIGHT SINGLE QUOTATION MARK |
“ |
|
U+201C |
LEFT DOUBLE QUOTATION MARK |
” |
|
U+201D |
RIGHT DOUBLE QUOTATION MARK |
Symbol |
HTML Entity |
Unicode Code Point |
Unicode Name |
---|---|---|---|
" |
|
U+0022 |
QUOTATION MARK (double quote) |
ʹ |
|
U+02B9 |
MODIFIER LETTER PRIME |
ʺ |
|
U+02BA |
MODIFIER LETTER DOUBLE PRIME |
ʼ |
|
U+02BC |
MODIFIER LETTER APOSTROPHE |
ˈ |
|
U+02C8 |
MODIFIER LETTER VERTICAL LINE |
˝ |
|
U+02DD |
DOUBLE ACUTE ACCENT |
ˮ |
|
U+02EE |
MODIFIER LETTER DOUBLE APOSTROPHE |
́ |
|
U+0301 |
COMBINING ACUTE ACCENT |
̋ |
|
U+030B |
COMBINING DOUBLE ACUTE ACCENT |
̍ |
|
U+030D |
COMBINING VERTICAL LINE ABOVE |
̎ |
|
U+030E |
COMBINING DOUBLE VERTICAL LINE ABOVE |
׳ |
|
U+05F3 |
HEBREW PUNCTUATION GERESH |
״ |
|
U+05F4 |
HEBREW PUNCTUATION GERSHAYIM |
′ |
|
U+2032 |
PRIME |
″ |
|
U+2033 |
DOUBLE PRIME |
〃 |
|
U+3003 |
DITTO MARK |
ꞌ |
|
U+A78C |
LATIN SMALL LETTER SALTILLO |
print_code_point_information(string.punctuation)
!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
32
Glyph Code Point Hex Bytes Category Named Entity Name
! \U00000021 000021 b'!' Po NO NAMED ENTITY EXCLAMATION MARK
" \U00000022 000022 b'"' Po quot QUOTATION MARK
# \U00000023 000023 b'#' Po NO NAMED ENTITY NUMBER SIGN
$ \U00000024 000024 b'$' Sc NO NAMED ENTITY DOLLAR SIGN
% \U00000025 000025 b'%' Po NO NAMED ENTITY PERCENT SIGN
& \U00000026 000026 b'&' Po amp AMPERSAND
' \U00000027 000027 b"'" Po NO NAMED ENTITY APOSTROPHE
( \U00000028 000028 b'(' Ps NO NAMED ENTITY LEFT PARENTHESIS
) \U00000029 000029 b')' Pe NO NAMED ENTITY RIGHT PARENTHESIS
* \U0000002A 00002A b'*' Po NO NAMED ENTITY ASTERISK
+ \U0000002B 00002B b'+' Sm NO NAMED ENTITY PLUS SIGN
, \U0000002C 00002C b',' Po NO NAMED ENTITY COMMA
- \U0000002D 00002D b'-' Pd NO NAMED ENTITY HYPHEN-MINUS
. \U0000002E 00002E b'.' Po NO NAMED ENTITY FULL STOP
/ \U0000002F 00002F b'/' Po NO NAMED ENTITY SOLIDUS
: \U0000003A 00003A b':' Po NO NAMED ENTITY COLON
; \U0000003B 00003B b';' Po NO NAMED ENTITY SEMICOLON
< \U0000003C 00003C b'<' Sm lt LESS-THAN SIGN
= \U0000003D 00003D b'=' Sm NO NAMED ENTITY EQUALS SIGN
> \U0000003E 00003E b'>' Sm gt GREATER-THAN SIGN
? \U0000003F 00003F b'?' Po NO NAMED ENTITY QUESTION MARK
@ \U00000040 000040 b'@' Po NO NAMED ENTITY COMMERCIAL AT
[ \U0000005B 00005B b'[' Ps NO NAMED ENTITY LEFT SQUARE BRACKET
\ \U0000005C 00005C b'\\' Po NO NAMED ENTITY REVERSE SOLIDUS
] \U0000005D 00005D b']' Pe NO NAMED ENTITY RIGHT SQUARE BRACKET
^ \U0000005E 00005E b'^' Sk NO NAMED ENTITY CIRCUMFLEX ACCENT
_ \U0000005F 00005F b'_' Pc NO NAMED ENTITY LOW LINE
` \U00000060 000060 b'`' Sk NO NAMED ENTITY GRAVE ACCENT
{ \U0000007B 00007B b'{' Ps NO NAMED ENTITY LEFT CURLY BRACKET
| \U0000007C 00007C b'|' Sm NO NAMED ENTITY VERTICAL LINE
} \U0000007D 00007D b'}' Pe NO NAMED ENTITY RIGHT CURLY BRACKET
~ \U0000007E 00007E b'~' Sm NO NAMED ENTITY TILDE
Whitespace#
print_code_point_information(string.whitespace[0])
1
Glyph Code Point Hex Bytes Category Named Entity Name
\U00000020 000020 b' ' Zs NO NAMED ENTITY SPACE
string.whitespace[1:]
'\t\n\r\x0b\x0c'
# print_code_point_information(string.printable)
# codec `ascii` only first 128
# Walk the first 256 code points and try to encode each with the `ascii`
# codec. The encode is attempted before the name lookup, so a character the
# codec rejects reports the encoding error, while an encodable character
# without a Unicode name reports the name-lookup error instead.
for ch in map(chr, range(256)):
    try:
        encoded = str(ch.encode('ascii'))
        name = unicodedata.name(ch)
    except (UnicodeEncodeError, ValueError) as e:
        print(e)
    else:
        print(f"{ch:<10} {encoded:<10} {name}")
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
b' ' SPACE
! b'!' EXCLAMATION MARK
" b'"' QUOTATION MARK
# b'#' NUMBER SIGN
$ b'$' DOLLAR SIGN
% b'%' PERCENT SIGN
& b'&' AMPERSAND
' b"'" APOSTROPHE
( b'(' LEFT PARENTHESIS
) b')' RIGHT PARENTHESIS
* b'*' ASTERISK
+ b'+' PLUS SIGN
, b',' COMMA
- b'-' HYPHEN-MINUS
. b'.' FULL STOP
/ b'/' SOLIDUS
0 b'0' DIGIT ZERO
1 b'1' DIGIT ONE
2 b'2' DIGIT TWO
3 b'3' DIGIT THREE
4 b'4' DIGIT FOUR
5 b'5' DIGIT FIVE
6 b'6' DIGIT SIX
7 b'7' DIGIT SEVEN
8 b'8' DIGIT EIGHT
9 b'9' DIGIT NINE
: b':' COLON
; b';' SEMICOLON
< b'<' LESS-THAN SIGN
= b'=' EQUALS SIGN
> b'>' GREATER-THAN SIGN
? b'?' QUESTION MARK
@ b'@' COMMERCIAL AT
A b'A' LATIN CAPITAL LETTER A
B b'B' LATIN CAPITAL LETTER B
C b'C' LATIN CAPITAL LETTER C
D b'D' LATIN CAPITAL LETTER D
E b'E' LATIN CAPITAL LETTER E
F b'F' LATIN CAPITAL LETTER F
G b'G' LATIN CAPITAL LETTER G
H b'H' LATIN CAPITAL LETTER H
I b'I' LATIN CAPITAL LETTER I
J b'J' LATIN CAPITAL LETTER J
K b'K' LATIN CAPITAL LETTER K
L b'L' LATIN CAPITAL LETTER L
M b'M' LATIN CAPITAL LETTER M
N b'N' LATIN CAPITAL LETTER N
O b'O' LATIN CAPITAL LETTER O
P b'P' LATIN CAPITAL LETTER P
Q b'Q' LATIN CAPITAL LETTER Q
R b'R' LATIN CAPITAL LETTER R
S b'S' LATIN CAPITAL LETTER S
T b'T' LATIN CAPITAL LETTER T
U b'U' LATIN CAPITAL LETTER U
V b'V' LATIN CAPITAL LETTER V
W b'W' LATIN CAPITAL LETTER W
X b'X' LATIN CAPITAL LETTER X
Y b'Y' LATIN CAPITAL LETTER Y
Z b'Z' LATIN CAPITAL LETTER Z
[ b'[' LEFT SQUARE BRACKET
\ b'\\' REVERSE SOLIDUS
] b']' RIGHT SQUARE BRACKET
^ b'^' CIRCUMFLEX ACCENT
_ b'_' LOW LINE
` b'`' GRAVE ACCENT
a b'a' LATIN SMALL LETTER A
b b'b' LATIN SMALL LETTER B
c b'c' LATIN SMALL LETTER C
d b'd' LATIN SMALL LETTER D
e b'e' LATIN SMALL LETTER E
f b'f' LATIN SMALL LETTER F
g b'g' LATIN SMALL LETTER G
h b'h' LATIN SMALL LETTER H
i b'i' LATIN SMALL LETTER I
j b'j' LATIN SMALL LETTER J
k b'k' LATIN SMALL LETTER K
l b'l' LATIN SMALL LETTER L
m b'm' LATIN SMALL LETTER M
n b'n' LATIN SMALL LETTER N
o b'o' LATIN SMALL LETTER O
p b'p' LATIN SMALL LETTER P
q b'q' LATIN SMALL LETTER Q
r b'r' LATIN SMALL LETTER R
s b's' LATIN SMALL LETTER S
t b't' LATIN SMALL LETTER T
u b'u' LATIN SMALL LETTER U
v b'v' LATIN SMALL LETTER V
w b'w' LATIN SMALL LETTER W
x b'x' LATIN SMALL LETTER X
y b'y' LATIN SMALL LETTER Y
z b'z' LATIN SMALL LETTER Z
{ b'{' LEFT CURLY BRACKET
| b'|' VERTICAL LINE
} b'}' RIGHT CURLY BRACKET
~ b'~' TILDE
no such name
'ascii' codec can't encode character '\x80' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\x81' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\x82' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\x83' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\x84' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\x85' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\x86' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\x87' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\x88' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\x89' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\x8a' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\x8b' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\x8c' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\x8d' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\x8e' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\x8f' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\x90' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\x91' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\x92' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\x93' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\x94' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\x95' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\x96' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\x97' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\x98' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\x99' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\x9a' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\x9b' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\x9c' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\x9d' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\x9e' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\x9f' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xa0' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xa1' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xa2' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xa3' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xa4' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xa5' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xa6' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xa7' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xa8' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xa9' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xaa' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xab' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xac' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xad' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xae' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xaf' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xb0' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xb1' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xb2' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xb3' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xb4' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xb5' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xb6' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xb7' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xb8' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xb9' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xba' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xbb' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xbc' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xbd' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xbe' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xbf' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xc0' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xc1' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xc2' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xc3' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xc4' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xc5' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xc6' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xc7' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xc8' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xc9' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xca' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xcb' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xcc' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xcd' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xce' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xcf' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xd0' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xd1' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xd2' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xd3' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xd4' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xd5' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xd6' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xd7' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xd8' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xd9' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xda' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xdb' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xdc' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xdd' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xde' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xdf' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xe0' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xe1' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xe2' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xe3' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xe4' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xe5' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xe6' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xe7' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xe8' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xe9' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xea' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xeb' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xec' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xed' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xee' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xef' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xf0' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xf1' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xf2' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xf3' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xf4' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xf5' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xf6' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xf7' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xf8' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xf9' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xfa' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xfb' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xfc' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xfd' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xfe' in position 0: ordinal not in range(128)
'ascii' codec can't encode character '\xff' in position 0: ordinal not in range(128)
# Every ASCII character (code points 0-127) encodes to exactly one byte.
all(len(char.encode('ascii')) == 1 for char in map(chr, range(128)))
True
Extended ASCII#
Code points 0-255 are mapped to bytes 0x0-0xff.
# The `latin-1` codec covers only the first 256 code points (0x00-0xFF).
# The range deliberately runs one past the end so that U+0100 shows the
# resulting UnicodeEncodeError as the last line of output.
for code_point in range(257):
    char = chr(code_point)
    try:
        # Raises UnicodeEncodeError for code points the codec cannot encode.
        encoded = char.encode('latin-1')
        # Raises ValueError ("no such name") for characters without a name.
        name = unicodedata.name(char)
    except (UnicodeEncodeError, ValueError) as e:
        print(e)
    else:
        # Columns: the character, its bytes literal, its Unicode name.
        print(f"{char:<10} {encoded!r:<10} {name}")
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
b' ' SPACE
! b'!' EXCLAMATION MARK
" b'"' QUOTATION MARK
# b'#' NUMBER SIGN
$ b'$' DOLLAR SIGN
% b'%' PERCENT SIGN
& b'&' AMPERSAND
' b"'" APOSTROPHE
( b'(' LEFT PARENTHESIS
) b')' RIGHT PARENTHESIS
* b'*' ASTERISK
+ b'+' PLUS SIGN
, b',' COMMA
- b'-' HYPHEN-MINUS
. b'.' FULL STOP
/ b'/' SOLIDUS
0 b'0' DIGIT ZERO
1 b'1' DIGIT ONE
2 b'2' DIGIT TWO
3 b'3' DIGIT THREE
4 b'4' DIGIT FOUR
5 b'5' DIGIT FIVE
6 b'6' DIGIT SIX
7 b'7' DIGIT SEVEN
8 b'8' DIGIT EIGHT
9 b'9' DIGIT NINE
: b':' COLON
; b';' SEMICOLON
< b'<' LESS-THAN SIGN
= b'=' EQUALS SIGN
> b'>' GREATER-THAN SIGN
? b'?' QUESTION MARK
@ b'@' COMMERCIAL AT
A b'A' LATIN CAPITAL LETTER A
B b'B' LATIN CAPITAL LETTER B
C b'C' LATIN CAPITAL LETTER C
D b'D' LATIN CAPITAL LETTER D
E b'E' LATIN CAPITAL LETTER E
F b'F' LATIN CAPITAL LETTER F
G b'G' LATIN CAPITAL LETTER G
H b'H' LATIN CAPITAL LETTER H
I b'I' LATIN CAPITAL LETTER I
J b'J' LATIN CAPITAL LETTER J
K b'K' LATIN CAPITAL LETTER K
L b'L' LATIN CAPITAL LETTER L
M b'M' LATIN CAPITAL LETTER M
N b'N' LATIN CAPITAL LETTER N
O b'O' LATIN CAPITAL LETTER O
P b'P' LATIN CAPITAL LETTER P
Q b'Q' LATIN CAPITAL LETTER Q
R b'R' LATIN CAPITAL LETTER R
S b'S' LATIN CAPITAL LETTER S
T b'T' LATIN CAPITAL LETTER T
U b'U' LATIN CAPITAL LETTER U
V b'V' LATIN CAPITAL LETTER V
W b'W' LATIN CAPITAL LETTER W
X b'X' LATIN CAPITAL LETTER X
Y b'Y' LATIN CAPITAL LETTER Y
Z b'Z' LATIN CAPITAL LETTER Z
[ b'[' LEFT SQUARE BRACKET
\ b'\\' REVERSE SOLIDUS
] b']' RIGHT SQUARE BRACKET
^ b'^' CIRCUMFLEX ACCENT
_ b'_' LOW LINE
` b'`' GRAVE ACCENT
a b'a' LATIN SMALL LETTER A
b b'b' LATIN SMALL LETTER B
c b'c' LATIN SMALL LETTER C
d b'd' LATIN SMALL LETTER D
e b'e' LATIN SMALL LETTER E
f b'f' LATIN SMALL LETTER F
g b'g' LATIN SMALL LETTER G
h b'h' LATIN SMALL LETTER H
i b'i' LATIN SMALL LETTER I
j b'j' LATIN SMALL LETTER J
k b'k' LATIN SMALL LETTER K
l b'l' LATIN SMALL LETTER L
m b'm' LATIN SMALL LETTER M
n b'n' LATIN SMALL LETTER N
o b'o' LATIN SMALL LETTER O
p b'p' LATIN SMALL LETTER P
q b'q' LATIN SMALL LETTER Q
r b'r' LATIN SMALL LETTER R
s b's' LATIN SMALL LETTER S
t b't' LATIN SMALL LETTER T
u b'u' LATIN SMALL LETTER U
v b'v' LATIN SMALL LETTER V
w b'w' LATIN SMALL LETTER W
x b'x' LATIN SMALL LETTER X
y b'y' LATIN SMALL LETTER Y
z b'z' LATIN SMALL LETTER Z
{ b'{' LEFT CURLY BRACKET
| b'|' VERTICAL LINE
} b'}' RIGHT CURLY BRACKET
~ b'~' TILDE
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
no such name
b'\xa0' NO-BREAK SPACE
¡ b'\xa1' INVERTED EXCLAMATION MARK
¢ b'\xa2' CENT SIGN
£ b'\xa3' POUND SIGN
¤ b'\xa4' CURRENCY SIGN
¥ b'\xa5' YEN SIGN
¦ b'\xa6' BROKEN BAR
§ b'\xa7' SECTION SIGN
¨ b'\xa8' DIAERESIS
© b'\xa9' COPYRIGHT SIGN
ª b'\xaa' FEMININE ORDINAL INDICATOR
« b'\xab' LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
¬ b'\xac' NOT SIGN
b'\xad' SOFT HYPHEN
® b'\xae' REGISTERED SIGN
¯ b'\xaf' MACRON
° b'\xb0' DEGREE SIGN
± b'\xb1' PLUS-MINUS SIGN
² b'\xb2' SUPERSCRIPT TWO
³ b'\xb3' SUPERSCRIPT THREE
´ b'\xb4' ACUTE ACCENT
µ b'\xb5' MICRO SIGN
¶ b'\xb6' PILCROW SIGN
· b'\xb7' MIDDLE DOT
¸ b'\xb8' CEDILLA
¹ b'\xb9' SUPERSCRIPT ONE
º b'\xba' MASCULINE ORDINAL INDICATOR
» b'\xbb' RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
¼ b'\xbc' VULGAR FRACTION ONE QUARTER
½ b'\xbd' VULGAR FRACTION ONE HALF
¾ b'\xbe' VULGAR FRACTION THREE QUARTERS
¿ b'\xbf' INVERTED QUESTION MARK
À b'\xc0' LATIN CAPITAL LETTER A WITH GRAVE
Á b'\xc1' LATIN CAPITAL LETTER A WITH ACUTE
 b'\xc2' LATIN CAPITAL LETTER A WITH CIRCUMFLEX
à b'\xc3' LATIN CAPITAL LETTER A WITH TILDE
Ä b'\xc4' LATIN CAPITAL LETTER A WITH DIAERESIS
Å b'\xc5' LATIN CAPITAL LETTER A WITH RING ABOVE
Æ b'\xc6' LATIN CAPITAL LETTER AE
Ç b'\xc7' LATIN CAPITAL LETTER C WITH CEDILLA
È b'\xc8' LATIN CAPITAL LETTER E WITH GRAVE
É b'\xc9' LATIN CAPITAL LETTER E WITH ACUTE
Ê b'\xca' LATIN CAPITAL LETTER E WITH CIRCUMFLEX
Ë b'\xcb' LATIN CAPITAL LETTER E WITH DIAERESIS
Ì b'\xcc' LATIN CAPITAL LETTER I WITH GRAVE
Í b'\xcd' LATIN CAPITAL LETTER I WITH ACUTE
Î b'\xce' LATIN CAPITAL LETTER I WITH CIRCUMFLEX
Ï b'\xcf' LATIN CAPITAL LETTER I WITH DIAERESIS
Ð b'\xd0' LATIN CAPITAL LETTER ETH
Ñ b'\xd1' LATIN CAPITAL LETTER N WITH TILDE
Ò b'\xd2' LATIN CAPITAL LETTER O WITH GRAVE
Ó b'\xd3' LATIN CAPITAL LETTER O WITH ACUTE
Ô b'\xd4' LATIN CAPITAL LETTER O WITH CIRCUMFLEX
Õ b'\xd5' LATIN CAPITAL LETTER O WITH TILDE
Ö b'\xd6' LATIN CAPITAL LETTER O WITH DIAERESIS
× b'\xd7' MULTIPLICATION SIGN
Ø b'\xd8' LATIN CAPITAL LETTER O WITH STROKE
Ù b'\xd9' LATIN CAPITAL LETTER U WITH GRAVE
Ú b'\xda' LATIN CAPITAL LETTER U WITH ACUTE
Û b'\xdb' LATIN CAPITAL LETTER U WITH CIRCUMFLEX
Ü b'\xdc' LATIN CAPITAL LETTER U WITH DIAERESIS
Ý b'\xdd' LATIN CAPITAL LETTER Y WITH ACUTE
Þ b'\xde' LATIN CAPITAL LETTER THORN
ß b'\xdf' LATIN SMALL LETTER SHARP S
à b'\xe0' LATIN SMALL LETTER A WITH GRAVE
á b'\xe1' LATIN SMALL LETTER A WITH ACUTE
â b'\xe2' LATIN SMALL LETTER A WITH CIRCUMFLEX
ã b'\xe3' LATIN SMALL LETTER A WITH TILDE
ä b'\xe4' LATIN SMALL LETTER A WITH DIAERESIS
å b'\xe5' LATIN SMALL LETTER A WITH RING ABOVE
æ b'\xe6' LATIN SMALL LETTER AE
ç b'\xe7' LATIN SMALL LETTER C WITH CEDILLA
è b'\xe8' LATIN SMALL LETTER E WITH GRAVE
é b'\xe9' LATIN SMALL LETTER E WITH ACUTE
ê b'\xea' LATIN SMALL LETTER E WITH CIRCUMFLEX
ë b'\xeb' LATIN SMALL LETTER E WITH DIAERESIS
ì b'\xec' LATIN SMALL LETTER I WITH GRAVE
í b'\xed' LATIN SMALL LETTER I WITH ACUTE
î b'\xee' LATIN SMALL LETTER I WITH CIRCUMFLEX
ï b'\xef' LATIN SMALL LETTER I WITH DIAERESIS
ð b'\xf0' LATIN SMALL LETTER ETH
ñ b'\xf1' LATIN SMALL LETTER N WITH TILDE
ò b'\xf2' LATIN SMALL LETTER O WITH GRAVE
ó b'\xf3' LATIN SMALL LETTER O WITH ACUTE
ô b'\xf4' LATIN SMALL LETTER O WITH CIRCUMFLEX
õ b'\xf5' LATIN SMALL LETTER O WITH TILDE
ö b'\xf6' LATIN SMALL LETTER O WITH DIAERESIS
÷ b'\xf7' DIVISION SIGN
ø b'\xf8' LATIN SMALL LETTER O WITH STROKE
ù b'\xf9' LATIN SMALL LETTER U WITH GRAVE
ú b'\xfa' LATIN SMALL LETTER U WITH ACUTE
û b'\xfb' LATIN SMALL LETTER U WITH CIRCUMFLEX
ü b'\xfc' LATIN SMALL LETTER U WITH DIAERESIS
ý b'\xfd' LATIN SMALL LETTER Y WITH ACUTE
þ b'\xfe' LATIN SMALL LETTER THORN
ÿ b'\xff' LATIN SMALL LETTER Y WITH DIAERESIS
'latin-1' codec can't encode character '\u0100' in position 0: ordinal not in range(256)
# Every Extended ASCII character (code points 0-255) encodes to exactly one byte.
all(len(char.encode('latin-1')) == 1 for char in map(chr, range(256)))
True
UTF-8#
8-bit encoding: this means that there are no issues with byte order and no BOM is required
each byte consists of two parts
marker bits (most significant bits): a sequence of zero to four 1 bits followed by a 0 bit
payload bits
the LSB of the Unicode character is the rightmost x bit
Range |
Encoding |
---|---|
U+0000 - U+007F |
0xxxxxxx |
U+0080 - U+07FF |
110xxxxx 10xxxxxx |
U+0800 - U+FFFF |
1110xxxx 10xxxxxx 10xxxxxx |
U+10000 - U+10FFFF |
11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
File Formats#
File Type |
Magic Number (hex) |
Magic Number (ASCII) |
File Offset [bytes] |
File Name Extension |
---|---|---|---|---|
DOS executable |
4D 5A |
MZ |
0 |
|
ELF |
7F 45 4C 46 |
\x7fELF (␡ELF) |
0 |
|
GIF |
GIF87a |
|
||
. |
GIF89a |
|
||
HDF |
\211HDF\r\n\032\n |
|
||
Java Class |
CA FE BA BE |
Êþº¾ |
|
|
JAR |
50 4B 03 04 |
PK\x03\x04 |
|
|
JPEG |
FF D8 FF DB |
ÿØÿÛ |
0 |
|
Linux/Unix Script |
23 21 |
#! |
|
|
MIDI |
4D 54 68 64 |
MThd (“MIDI Track Header”) |
||
PDF |
25 50 44 46 |
%PDF |
|||
PNG |
89 50 4E 47 0D 0A 1A 0A |
\x89PNG\r\n\x1a\n (‰PNG␍␊␚␊) |
|
|
PS |
25 21 (50 53) |
%!(PS) |
|
|
TIFF (Intel little end) |
49 49 2A 00 |
II* |
|
|
TIFF (Motorola big end) |
4D 4D 00 2A |
MM* |
||
XML |
<?xml |
|
||
Zip |
50 4B 03 04 |
PK\x03\x04 |
0 |
|
. |
50 4B 05 06 |
|||
. |
50 4B 07 08 |
# Print each file type next to its magic number rendered as text: every byte
# value of the signature is mapped to the character with that code point.
pad = 30
signatures = (
    ('DOS',                     (0x4d, 0x5a)),
    ('ELF',                     (0x7f, 0x45, 0x4c, 0x46)),
    ('Java Archive',            (0x50, 0x4b, 0x03, 0x04)),
    ('Java Class',              (0xca, 0xfe, 0xba, 0xbe)),
    ('JPEG',                    (0xff, 0xd8, 0xff, 0xdb)),
    ('Linux/Unix Script',       (0x23, 0x21)),
    ('MIDI',                    (0x4D, 0x54, 0x68, 0x64)),
    ('PDF',                     (0x25, 0x50, 0x44, 0x46)),
    ('PNG',                     (0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a)),
    ('PS',                      (0x25, 0x21, 0x50, 0x53)),
    ('TIFF (Intel little end)', (0x49, 0x49, 0x2A, 0x00)),
    ('TIFF (Motorola big end)', (0x4D, 0x4D, 0x00, 0x2A)),
    ('Zip',                     (0x50, 0x4b, 0x03, 0x04)),
)
for file_type, magic in signatures:
    # Left-align the label in a `pad`-wide column, then the decoded signature.
    print(f"{file_type:<{pad}} {''.join(map(chr, magic))}")
DOS MZ ELF ELF Java Archive PK Java Class Êþº¾ JPEG ÿØÿÛ Linux/Unix Script #! MIDI MThd PDF %PDF PNG PNG PS %!PS TIFF (Intel little end) II*TIFF (Motorola big end) MM* Zip PK
Figures#
Resources#
Named Character References HTML
[ h ][ w ] Bitmap (BMP)
.bmp
[ h ][ w ] Comma-Separated Values (CSV)
.csv
[ h ][ w ] DOS MZ “Mark Zbikowski” Executable
.exe
[ h ][ w ] Extensible Markup Language (XML)
.xml
[ h ][ w ] Graphics Interchange Format (GIF)
.gif
[ h ][ w ] Hierarchical Data Format (HDF)
.hd5
,.hdf5
[ h ][ w ] Joint Photographic Experts Group (JPEG)
.jpg
,.jpeg
[ h ][ w ] Java Archive File (JAR)
.jar
[ h ][ w ] MP3
[ h ][ w ] Musical Instrument Digital Interface (MIDI)
[ h ][ w ] Portable Document Format (PDF)
.pdf
[ h ][ w ] Portable Network Graphics (PNG)
.png
[ h ][ w ] Scalable Vector Graphics (SVG)
.svg
[ h ][ w ] Tab-Separated Values (TSV)
.tsv
[ h ][ w ] Tag(ged) Image File Format (TIFF)
[ h ][ w ] Waveform Audio File Format (WAV)
.wav
,.wave
Terms#
[ w ] .exe
[ w ] ANSI Escape Sequences
[ w ] Archive File
[ w ] list of archive formats
[ w ] American Standard Code for Information Interchange (ASCII)
[ w ] Audio File Format
[ w ] Big Endian
[ w ] Binary-Coded Decimal Interchange Code (BCDIC)
[ w ] Binary-to-Text Encoding
[ w ] Binary Code
[ w ] Binary File
[ w ] Bit Numbering
[ w ] Bit String
[ w ] Byte
[ w ] Byte Order Mark (BOM)
[ w ] C0 & C1 Control Codes
[ w ] Caps Lock
[ w ] Caret Notation
[ w ] Character
[ w ] Character Encoding
[ w ] Character Entities
[ w ] Code
[ w ] Code Page
[ w ] Code Point
[ w ] Codec
[ w ] Combining Character
[ w ] Complex Text Layout (CTL)
[ w ] Container Format
[ w ] Control Character
[ w ] Control Picture
[ w ] Ctrl-Alt-Del
[ w ] Ctrl-C
[ w ] Ctrl-D
[ w ] Ctrl-Z
[ w ] Data Compression
[ w ] Data Compression Ratio
[ w ] Data Conversion
[ w ] Data File
[ w ] Deflate
[ w ] Diacritic
[ w ] Diaeresis
[ w ] Dictionary Coder
[ w ] Disk Image
[ w ] Document Template
[ w ] Electronic Data Interchange (EDI)
[ w ] End of File (EOF)
[ w ] End of Line (EOL)
[ w ] Endianness
[ w ] Enriched Text
[ w ] Escape Character
[ w ] Escape Sequence
[ w ] Escape Sequence in C
[ w ] Executable File
[ w ] list of executable file formats
[ w ] Executable and Linkable Format (ELF)
[ w ] Executable Compression
[ w ] Extended Binary Coded Decimal Interchange Code (EBCDIC)
[ w ] Extended ASCII
[ w ] File Archiver
[ w ] File Archivers
[ w ] File Format
[ w ] list of file formats
[ w ] File Name Extension
[ w ] Grapheme
[ w ] Grave Accent
[ w ] Graphics Interchange Format (GIF)
[ w ] Guillemet
[ w ] gzip
[ w ] Hexadecimal
[ w ] Huffman Coding
[ w ] Image File Format
[ w ] list of image file formats
[ w ] Interchange File Format (IFF)
[ w ] International Phonetic Alphabet (IPA)
[ w ] Internationalization and Localization
[ w ] Java Class File
[ w ] Language-Independent Specification (LIS)
[ w ] Lempel-Ziv-Oberhumer (LZO)
[ w ] Letter Case
[ w ] Ligature
[ w ] Line
[ w ] Linear Predictive Coding (LPC)
[ w ] Little Endian
[ w ] Lossless Compression
[ w ] Lossy Compression
[ w ] Lower Case (Minuscule)
[ w ] LZ77 LZ78
[ w ] Magic Number
[ w ] Manifest File
[ w ] Metacharacter
[ w ] Mojibake
[ w ] Named Character Reference
[ w ] Newline
[ w ] Nibble
[ w ] Null-Terminated String
[ w ] Number Sign
[ w ] Numeric Character Reference
[ w ] Object File
[ w ] Octet
[ w ] Open File Format
[ w ] OpenDocument
[ w ] OpenType
[ w ] Page Break
[ w ] pax
[ w ] Percent Encoding
[ w ] Plain Text
[ w ] Plane
[ w ] PostScript (PS)
[ w ] Pound Sign
[ w ] Precomposed Character
[ w ] Punctuation
[ w ] Raster Graphics
[ w ] Rich Text
[ w ] Ring
[ w ] Run-Length Encoding (RLE)
[ w ] Self-Synchronizing Code
[ w ] Serialization
[ w ] Shebang
[ w ] Simple Data Format (SDF)
[ w ] Software Flow Control
[ w ] Specials
[ w ] String
[ w ] String Literal
[ w ] Tab-Separated Values
[ w ] Tab Stop
[ w ] tar
[ w ] Text Normalization
[ w ] Touch Typing
[ w ] Typeface
[ w ] Unicode
[ w ] Unicode Block
[ w ] Unicode Character Property
[ w ] Unicode Collation Algorithm
[ w ] Unicode Consortium
[ w ] Unicode Equivalence
[ w ] Universal Character Set (UCS) characters
[ w ] Universal Coded Character Set (UCS)
[ w ] Upper Case (Majuscule)
[ w ] URL Encoding
[ w ] UTF-16
[ w ] UTF-8
[ w ] Variable-Width Encoding
[ w ] Video File Format
[ w ] Whitespace
[ w ] Word
[ w ] Writing System
[ w ] Zip
[ w ] Zlib
[ w ] Zstd