generated from okikio/transferables
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconstants.ts
129 lines (116 loc) · 4.17 KB
/
constants.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
/**
* @module
* This module defines constants used for UTF-8 character encoding,
* covering 1-byte to 5-byte sequences, including their leading bits
* and masks for identifying and extracting the encoded character bits.
*
* Defines constants for UTF-8 encoding operations, including lead bytes, masks, and bits required for different byte sequences.
* These constants are essential for encoding and decoding UTF-8 characters, from simple ASCII to complex symbols and emojis.
*
* @example
* Imagine encoding the character '𝄞' (the G Clef symbol in music), which requires a 4-byte UTF-8 sequence.
*
* 1. Identify the lead byte for a 4-byte sequence: `LEAD_FOR_4B` (1111 0000 in binary)
* 2. The mask for extracting significant bits from the first byte in a 4-byte sequence: `MASK_FOR_4B` (0000 0111 in binary)
* 3. To encode '𝄞' (U+1D11E), shift its code point right by `BITS_FOR_4B` (18) so that only its highest bits (19 -> 21) remain for the first byte.
*
* The process involves:
* - Using `LEAD_FOR_4B` to start the encoding sequence.
* - Applying `MASK_FOR_4B` to extract the first few significant bits of the character.
* - Shifting by `BITS_FOR_4B`, `BITS_FOR_3B`, and `BITS_FOR_2B` to position the remaining bits correctly.
*
* For a 2-byte character like 'Ω' (Omega):
* - Start with `LEAD_FOR_2B` (1100 0000 in binary) to indicate a 2-byte sequence.
* - Use `MASK_FOR_2B` (0001 1111 in binary) for the first byte's significant bits.
* - The shift amount is `BITS_FOR_2B` (6 bits for positions 7 to 12).
*
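* A 1-byte ASCII character, such as 'A':
* - Is stored unchanged in a single byte whose top bit is clear (0100 0001 for 'A'); no lead bits or mask are applied.
* - Note that `LEAD_FOR_1B` (1000 0000 in binary) and `MASK_FOR_1B` (0011 1111 in binary) correspond to the marker and payload mask of the continuation bytes (10xx xxxx) that follow a multi-byte lead byte.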
*/
// 1-byte encoding
/**
* The high bit of a byte (`1000 0000`).
*
* Despite the name, this is not the lead pattern of a 1-byte sequence: in UTF-8
* a single-byte (ASCII) character always has this bit clear (`0xxx xxxx`).
* The same value is the `10xx xxxx` marker carried by every continuation byte
* of a multi-byte sequence.
*
* NOTE(review): presumably callers test `byte & LEAD_FOR_1B` to tell ASCII from
* multi-byte data and OR this value into continuation bytes — confirm against the usages.
*
* @example `1000 0000`
*/
export const LEAD_FOR_1B = 0x80; // 1000 0000
/**
* Mask selecting the low six bits of a byte (`0011 1111`).
*
* Despite the name, a 1-byte (ASCII) character carries seven payload bits, not six.
* Six bits is the payload of a UTF-8 continuation byte (`10xx xxxx`), so this mask
* strips the `10` marker from such a byte and keeps its data bits.
*
* NOTE(review): presumably applied to continuation bytes when decoding and to
* shifted code points when encoding — confirm against the usages.
*
* @example `0011 1111`
*/
export const MASK_FOR_1B = 0x3F; // 0011 1111
// 2-byte encoding
/**
* Shift distance, in bits, between a code point and the payload of the lead byte
* of a 2-byte sequence.
*
* A continuation byte holds six payload bits, so shifting a code point right by
* this amount drops the bits stored in the single continuation byte and leaves
* the bits that belong in the lead byte. This is a shift amount, not the total
* number of payload bits in a 2-byte sequence (which is 11: five in the lead
* byte plus six in the continuation byte).
*
* @example highest bits 7 -> 12
*/
export const BITS_FOR_2B = 6; // highest bits 7 -> 12
/**
* Prefix bits (`110x xxxx`) that mark the first byte of a 2-byte UTF-8 sequence.
*
* @example `1100 0000`
*/
export const LEAD_FOR_2B = 0b11000000; // 0xC0
/**
* Mask that keeps the five payload bits of the first byte of a 2-byte sequence,
* discarding its `110` prefix.
*
* @example `0001 1111`
*/
export const MASK_FOR_2B = 0b00011111; // 0x1F
// 3-byte encoding
/**
* Shift distance, in bits, between a code point and the payload of the lead byte
* of a 3-byte sequence.
*
* Two continuation bytes of six payload bits each follow the lead byte, so
* shifting a code point right by 12 leaves the bits that belong in the lead byte.
* This is a shift amount, not the total number of payload bits in a 3-byte
* sequence (which is 16: four in the lead byte plus twelve in the continuation bytes).
*
* @example highest bits 13 -> 18
*/
export const BITS_FOR_3B = 12; // highest bits 13 -> 18
/**
* Prefix bits (`1110 xxxx`) that mark the first byte of a 3-byte UTF-8 sequence.
*
* @example `1110 0000`
*/
export const LEAD_FOR_3B = 0b11100000; // 0xE0
/**
* Mask that keeps the four payload bits of the first byte of a 3-byte sequence,
* discarding its `1110` prefix.
*
* @example `0000 1111`
*/
export const MASK_FOR_3B = 0b00001111; // 0x0F
// 4-byte encoding
/**
* Shift distance, in bits, between a code point and the payload of the lead byte
* of a 4-byte sequence.
*
* Three continuation bytes of six payload bits each follow the lead byte, so
* shifting a code point right by 18 leaves the bits that belong in the lead byte.
* This is a shift amount, not the total number of payload bits in a 4-byte
* sequence (which is 21: three in the lead byte plus eighteen in the continuation bytes).
*
* @example highest bits 19 -> 21
*/
export const BITS_FOR_4B = 18; // highest bits 19 -> 21
/**
* Prefix bits (`1111 0xxx`) that mark the first byte of a 4-byte UTF-8 sequence.
*
* @example `1111 0000`
*/
export const LEAD_FOR_4B = 0b11110000; // 0xF0
/**
* Mask that keeps the three payload bits of the first byte of a 4-byte sequence,
* discarding its `1111 0` prefix.
*
* @example `0000 0111`
*/
export const MASK_FOR_4B = 0b00000111; // 0x07
// 5-byte encoding
/**
* Bit pattern `1111 1000`, the prefix a 5-byte sequence would start with.
* Standard UTF-8 has no 5-byte sequences; the value is provided only for completeness.
*
* @example `1111 1000`
*/
export const LEAD_FOR_5B = 0b11111000; // 0xF8
// UTF-8 encoding specifics
/**
* Upper bound on the encoded size of a single UTF-8 character, in bytes.
* No character needs more bytes than this.
*
* @example 4
*/
export const UTF8_MAX_BYTE_LENGTH = 4;