VapourSynth-llvmexpr
Loading...
Searching...
No Matches
Tokenizer.cpp
Go to the documentation of this file.
1
19
20#include "Tokenizer.hpp"
21#include <algorithm>
22#include <cctype>
23
24namespace infix2postfix {
25
26namespace {
27
28std::map<std::string, TokenType> build_keywords_map() {
29 std::map<std::string, TokenType> map;
30 for (const auto& mapping : TOKEN_MAPPINGS) {
31 if (mapping.str.length() > 1 && (std::isalpha(mapping.str[0]) != 0)) {
32 map.emplace(mapping.str, mapping.type);
33 }
34 }
35 return map;
36}
37
38using OpMap = std::map<char, std::vector<TokenMapping>>;
39
40OpMap build_operator_map() {
41 OpMap map;
42 for (const auto& mapping : TOKEN_MAPPINGS) {
43 if (!mapping.str.empty() && (std::isalpha(mapping.str[0]) == 0)) {
44 map[mapping.str[0]].push_back(mapping);
45 }
46 }
47
48 // Sort by length descending for greedy matching
49 for (auto& pair : map) {
50 std::ranges::sort(pair.second,
51 [](const TokenMapping& a, const TokenMapping& b) {
52 return a.str.length() > b.str.length();
53 });
54 }
55 return map;
56}
57
58} // namespace
59
60const std::map<std::string, TokenType> Tokenizer::keywords =
61 build_keywords_map();
62static const OpMap operator_map = build_operator_map();
63
64Tokenizer::Tokenizer(std::string source) : source(std::move(source)) {}
65
66std::vector<Token> Tokenizer::tokenize() {
67 std::vector<Token> tokens;
68 while (peek() != '\0') {
69 tokens.push_back(nextToken());
70 }
71 tokens.push_back(makeToken(TokenType::EndOfFile));
72 return tokens;
73}
74
75Token Tokenizer::nextToken() {
76 skipWhitespaceAndComments();
77 start = current;
78 start_line = line;
79 start_column = column;
80 if (peek() == '\0') {
81 return makeToken(TokenType::EndOfFile);
82 }
83
84 char c = peek();
85
86 if (c == '\n') {
87 advance();
88 return makeToken(TokenType::Newline);
89 }
90
91 if ((std::isalpha(c) != 0) || c == '_') {
92 return identifier();
93 }
94 if (std::isdigit(c) != 0) {
95 return number();
96 }
97 if (c == '$') {
98 if ((std::isalpha(peek(1)) != 0) || peek(1) == '_') {
99 advance(); // '$'
100 while ((std::isalnum(peek()) != 0) || peek() == '_') {
101 advance();
102 }
103 return makeToken(TokenType::Identifier);
104 }
105 }
106
107 if (c == '<') {
108 if (source.substr(current, std::string("<global").length()) ==
109 "<global") {
110 return globalDeclaration();
111 }
112 }
113
114 if (operator_map.contains(c)) {
115 const auto& possible_tokens = operator_map.at(c);
116 auto it =
117 std::ranges::find_if(possible_tokens, [&](const auto& mapping) {
118 return source.substr(current, mapping.str.length()) ==
119 mapping.str;
120 });
121 if (it != possible_tokens.end()) {
122 current += it->str.length();
123 return makeToken(it->type);
124 }
125 }
126
127 advance();
128 return makeToken(TokenType::Invalid, std::string(1, c));
129}
130
131void Tokenizer::skipWhitespaceAndComments() {
132 while (true) {
133 char c = peek();
134 switch (c) {
135 case ' ':
136 case '\r':
137 case '\t':
138 advance();
139 break;
140 case '#':
141 while (peek() != '\n' && peek() != '\0') {
142 advance();
143 }
144 break;
145 default:
146 return;
147 }
148 }
149}
150
151char Tokenizer::peek(int offset) const {
152 if (current + offset >= source.length()) {
153 return '\0';
154 }
155 return source[current + offset];
156}
157
158char Tokenizer::advance() {
159 if (current < source.length()) {
160 if (source[current] == '\n') {
161 line++;
162 column = 1;
163 } else {
164 column++;
165 }
166 current++;
167 }
168 return source[current - 1];
169}
170
171Token Tokenizer::makeToken(TokenType type, const std::string& value) const {
172 Range range;
173 range.start.line = start_line;
174 range.start.column = start_column;
175 range.end.line = line;
176 range.end.column =
177 column - 1; // column is already advanced past the last character
178
179 return {.type = type,
180 .value =
181 value.empty() ? source.substr(start, current - start) : value,
182 .range = range};
183}
184
185Token Tokenizer::identifier() {
186 while ((std::isalnum(peek()) != 0) || peek() == '_') {
187 advance();
188 }
189 std::string text = source.substr(start, current - start);
190 auto it = keywords.find(text);
191 if (it != keywords.end()) {
192 return makeToken(it->second);
193 }
194 return makeToken(TokenType::Identifier);
195}
196
197Token Tokenizer::number() {
198 bool is_hex = false;
199 if (peek() == '0' && (peek(1) == 'x' || peek(1) == 'X')) {
200 is_hex = true;
201 advance(); // 0
202 advance(); // x
203 }
204
205 while ((std::isdigit(peek()) != 0) ||
206 (is_hex && (std::isxdigit(peek()) != 0))) {
207 advance();
208 }
209
210 if (peek() == '.' && (std::isdigit(peek(1)) != 0)) {
211 advance(); // '.'
212 while (std::isdigit(peek()) != 0) {
213 advance();
214 }
215 }
216
217 if (is_hex && (peek() == 'p' || peek() == 'P')) {
218 advance(); // 'p'
219 if (peek() == '+' || peek() == '-') {
220 advance(); // sign
221 }
222 while (std::isdigit(peek()) != 0) {
223 advance();
224 }
225 } else if (!is_hex && (peek() == 'e' || peek() == 'E')) {
226 advance(); // 'e'
227 if (peek() == '+' || peek() == '-') {
228 advance(); // sign
229 }
230 while (std::isdigit(peek()) != 0) {
231 advance();
232 }
233 }
234
235 return makeToken(TokenType::Number);
236}
237
238Token Tokenizer::globalDeclaration() {
239 advance(); // Consume initial '<'
240 int depth = 1; // We've already seen the opening '<'
241 while (depth > 0 && peek() != '\0') {
242 char c = peek();
243 if (c == '<') {
244 depth++;
245 } else if (c == '>') {
246 depth--;
247 }
248 advance();
249 }
250
251 if (depth == 0) {
252 return makeToken(TokenType::Global);
253 }
254 return makeToken(TokenType::Invalid, source.substr(start, current - start));
255}
256
257} // namespace infix2postfix
std::vector< Token > tokenize()
Definition Tokenizer.cpp:66
Tokenizer(std::string source)
Definition Tokenizer.cpp:64
constexpr std::array TOKEN_MAPPINGS
Definition types.hpp:126