libmya 0.1.0
Library to parse the Mya language.
Loading...
Searching...
No Matches
lexer.c
Go to the documentation of this file.
1#include <ctype.h>
2#include <stdbool.h>
3#include <stdio.h>
4#include <stdlib.h>
5#include <string.h>
6
7#include "debug.h"
8#include "dstring.h"
9#include "module.h"
10#include "queue.h"
11#include "token.h"
12#include "types/err.h"
13#include "types/keywords.h"
14#include "types/operators.h"
15
16
17static inline void
18_mod_rm_line(module_t* module);
19
20static inline unsigned int
21_mod_read_operator(module_t* module, unsigned int line, unsigned int column);
22
23static inline unsigned int
24_mod_read_number(module_t* module, unsigned int line, unsigned int column);
25
26static inline unsigned int
27_mod_read_string(module_t* module, unsigned int line, unsigned int column);
28
29static inline unsigned int
30_mod_read_identifier(module_t* module, unsigned int line, unsigned int column);
31
32static int
33_get_keyword(const char* lexeme);
34
35static int
36_get_operator(const char* lexeme);
37
38static int
39_char_escape(int ch);
40
41static inline unsigned int
42_char2base(int ch);
43
44static inline bool
45_isbasedigit(unsigned int base, int ch);
46
/*
 * Adds a new token to `module` and initializes it with the given lexeme and
 * type at the current position.
 *
 * Relies on `token`, `module`, `line` and `column` being in scope at the
 * expansion site. The body is wrapped in do { } while (0) so the macro
 * behaves as a single statement (e.g. under an unbraced `if`); the caller
 * supplies the trailing semicolon.
 */
#define MOD_ADD(lexeme, type) \
    do { \
        token = module_add_token(module); \
        token_init(token, (lexeme), (type), line, column); \
        DPRINTF3("Added token `%s` at %s:%d:%d.\n", (lexeme), module->filepath, line, column); \
    } while (0)
51
54{
55 char message[128];
56 int ch;
57 unsigned int line = 1;
58 unsigned int column = 1;
60
61 for (; module_lookup(module, &ch, 0) == ERR_OK; column++) {
62 if (isblank(ch)) {
63 module_getc(module, &ch);
64 continue;
65 }
66
67 switch (ch) {
68 case '#':
69 _mod_rm_line(module);
70 line++;
71 column = 0;
72 continue;
73 case '\n':
74 line++;
75 column = 0;
76 break;
77 case '{':
79 break;
80 case '}':
82 break;
83 case '(':
85 break;
86 case ')':
88 break;
89 case '[':
91 break;
92 case ']':
94 break;
95 case ';':
97 break;
98 case ':':
99 MOD_ADD(":", TK_COLON);
100 break;
101 case ',':
102 MOD_ADD(",", TK_COMMA);
103 break;
104 case '=':
105 MOD_ADD("=", TK_EQUAL);
106 break;
107 case '"':
108 column += _mod_read_string(module, line, column) - 1;
109 continue;
110 default:
111 if (isdigit(ch)) {
112 column += _mod_read_number(module, line, column) - 1;
113 continue;
114 }
115
116 if (ispunct(ch)) {
117 column += _mod_read_operator(module, line, column) - 1;
118 continue;
119 }
120
121 if (isalnum(ch)) {
122 column += _mod_read_identifier(module, line, column) - 1;
123 continue;
124 }
125
126 sprintf(message, "Character '%c' is unexpected here!\n", ch);
127
128 module_add_error(module, line, column, 1, message);
129 break;
130 }
131
132 module_getc(module, &ch);
133 }
134
135 MOD_ADD(":EOF:", TK_EOF);
136
137 return ERR_OK;
138}
139
140static inline void
141_mod_rm_line(module_t* module)
142{
143 error_code_t err;
144 int ch;
145
146 do {
147 err = module_getc(module, &ch);
148 } while (err == ERR_OK && ch != '\n' && ch != EOF);
149}
150
151static inline unsigned int
152_mod_read_operator(module_t* module, unsigned int line, unsigned int column)
153{
154 char message[128];
155 int ch;
156 dstring_t* lexeme;
157
159 token_init(token, "", TK_OPERATOR, line, column);
160 lexeme = &token->lexeme;
161
162 module_lookup(module, &ch, 0);
163 while (ispunct(ch)) {
164 dstring_putchar(lexeme, ch);
165 module_getc(module, &ch);
166
167 module_lookup(module, &ch, 0);
168 }
169
170 int op = _get_operator(lexeme->data);
171 if (op < 0) {
172 sprintf(message, "Operator `%s` is invalid!", lexeme->data);
173
174 module_add_error(module, line, column, lexeme->length, message);
175 }
176
177 token->value = op;
178
179 return lexeme->length;
180}
181
182static inline unsigned int
183_mod_read_number(module_t* module, unsigned int line, unsigned int column)
184{
185 char message[128];
186 int firstchar;
187 int secondchar;
188 dstring_t* lexeme;
189 unsigned int base = 10;
190
192 token_init(token, "", TK_NUMBER, line, column);
193 lexeme = &token->lexeme;
194
195 module_getc(module, &firstchar);
196 module_lookup(module, &secondchar, 0);
197
198 if (! isalnum(secondchar)) {
199 dstring_putchar(lexeme, firstchar);
200 token->value = firstchar - '0';
201 DPRINTF3("Added token `%s` at %s:%d:%d.\n", lexeme->data, module->filepath, line, column);
202 return 1;
203 }
204
205 if (firstchar == '0') {
206 base = _char2base(secondchar);
207
208 if (base == 0) {
209 dstring_putchar(lexeme, firstchar);
210 token->value = firstchar - '0';
211 DPRINTF3("Added token `%s` at %s:%d:%d.\n", lexeme->data, module->filepath, line, column);
212
213 sprintf(
214 message,
215 "Character '%c' is an invalid base indicator. It should be 'x' (hexadecimal), 'b' (binary) or 'o' (octal).",
216 secondchar
217 );
218
219 module_add_error(module, line, column + 1, 1, message);
220 return 1;
221 }
222 }
223
224 if (base == 10) {
225 dstring_putchar(lexeme, firstchar);
226 } else {
227 module_getc(module, &secondchar);
228 dstring_putchar(lexeme, firstchar);
229 dstring_putchar(lexeme, secondchar);
230 }
231
232 module_lookup(module, &firstchar, 0);
233 while (_isbasedigit(base, firstchar)) {
234 dstring_putchar(lexeme, firstchar);
235
236 module_getc(module, &secondchar);
237 module_lookup(module, &firstchar, 0);
238 }
239
240 if (isalnum(firstchar)) {
241 sprintf(message, "Character '%c' is invalid for base %d literal number.", firstchar, base);
242
243 module_add_error(module, line, column + lexeme->length, 1, message);
244 }
245
246 char* number_start = lexeme->data + 2 * (base != 10);
247 token->value = strtoll(number_start, NULL, base);
248
249 DPRINTF3("Added token `%s` at %s:%d:%d.\n", lexeme->data, module->filepath, line, column);
250 return lexeme->length;
251}
252
253static inline unsigned int
254_mod_read_string(module_t* module, unsigned int line, unsigned int column)
255{
256 int ch;
257 char message[128];
258 dstring_t* lexeme;
260
261 token_init(token, "", TK_STRING, line, column);
262 lexeme = &token->lexeme;
263
264 module_getc(module, &ch); // Remove '"'
265
266 module_getc(module, &ch);
267 while (ch != '"') {
268 if (ch == '\\') {
269 module_getc(module, &ch);
270 ch = _char_escape(ch);
271 }
272
273 dstring_putchar(lexeme, ch);
274
275 if (module_getc(module, &ch) != ERR_OK) {
276 sprintf(message, "End of file reached when trying to find the end of the string starting at line %d.\n", line);
277
278 module_add_error(module, line, column, 1, message);
279
280 DPRINTF3("Added token `\"%s\"` at %s:%d:%d.\n", lexeme->data, module->filepath, line, column);
281 return lexeme->length + 1;
282 };
283 }
284
285 DPRINTF3("Added token `\"%s\"` at %s:%d:%d.\n", lexeme->data, module->filepath, line, column);
286 return lexeme->length + 2;
287}
288
289static inline unsigned int
290_mod_read_identifier(module_t* module, unsigned int line, unsigned int column)
291{
292 int ch;
293 dstring_t* lexeme;
295
296 token_init(token, "", TK_IDENTIFIER, line, column);
297 lexeme = &token->lexeme;
298
299 while (module_lookup(module, &ch, 0) == ERR_OK && (isalnum(ch) || ch == '_')) {
300 dstring_putchar(lexeme, ch);
301 module_getc(module, &ch);
302 }
303
304 int key = _get_keyword(lexeme->data);
305
306 if (key >= 0) {
308 token->value = key;
309 }
310
311 DPRINTF3("Added token `%s` at %s:%d:%d.\n", lexeme->data, module->filepath, line, column);
312 return lexeme->length;
313}
314
315static int
316_get_keyword(const char* lexeme)
317{
318 for (int i = 0; mya_keywords[i]; i++) {
319 if (! strcmp(lexeme, mya_keywords[i])) {
320 return i;
321 }
322 }
323
324 return -1;
325}
326
327static int
328_get_operator(const char* lexeme)
329{
330 for (int i = 0; mya_operators[i]; i++) {
331 if (! strcmp(lexeme, mya_operators[i])) {
332 return i;
333 }
334 }
335
336 return -1;
337}
338
/*
 * Maps the character that follows a leading '0' to a numeric base.
 *
 * @return 16 for 'x', 8 for 'o', 2 for 'b', 10 for a decimal digit and
 *         0 for anything else.
 */
static inline unsigned int
_char2base(int ch)
{
    if (ch == 'x') {
        return 16;
    }

    if (ch == 'o') {
        return 8;
    }

    if (ch == 'b') {
        return 2;
    }

    if (isdigit(ch)) {
        return 10;
    }

    return 0;
}
357
/*
 * Tells whether `ch` is a valid digit for the given base.
 *
 * Only bases 16, 10, 8 and 2 are recognised; any other base yields false.
 */
static inline bool
_isbasedigit(unsigned int base, int ch)
{
    if (base == 16) {
        return isxdigit(ch) != 0;
    }

    if (base == 10) {
        return isdigit(ch) != 0;
    }

    if (base == 8) {
        return ch >= '0' && ch <= '7';
    }

    if (base == 2) {
        return ch == '0' || ch == '1';
    }

    return false;
}
374
/*
 * Translates the character that follows a backslash in a string literal.
 *
 * 'n', 'r' and '0' map to newline, carriage return and NUL respectively.
 * Every other character maps to itself, which is what makes `\\` and `\"`
 * produce a literal backslash and quote.
 *
 * Defined `static` to match its forward declaration; the helper is private
 * to this file.
 */
static int
_char_escape(int ch)
{
    switch (ch) {
    case 'n':
        return '\n';
    case 'r':
        return '\r';
    case '0':
        return '\0';
    default:
        return ch;
    }
}
#define DPRINTF3(fmt,...)
Definition debug.h:29
void dstring_putchar(dstring_t *string, int character)
Appends a character to the end of the dstring.
Definition dstring.c:25
enum error_code error_code_t
Enumeration of error codes.
@ ERR_OK
Definition err.h:15
const char * mya_operators[]
Definition globals.c:21
const char * mya_keywords[]
Definition globals.c:10
error_code_t mya_lexer(module_t *module)
Performs the lexical analysis of the given module.
Definition lexer.c:53
#define MOD_ADD(lexeme, type)
Definition lexer.c:47
void module_add_error(module_t *module, unsigned int line, unsigned int column, unsigned int length, const char *message)
Add error for the given module.
Definition module.c:92
token_t * module_add_token(module_t *module)
Add a new token to module.
Definition module.c:44
error_code_t module_lookup(module_t *module, int *chret, unsigned int seek)
Gets a character from the module's file without removing it from the queue.
Definition module.c:63
error_code_t module_getc(module_t *module, int *chret)
Gets the next character from the module's file, removing it from the queue.
Definition module.c:55
unsigned int length
The length of the string.
Definition dstring.h:13
char * data
Pointer for the raw string content (a normal C string).
Definition dstring.h:12
Struct that represents a Mya module.
Definition module.h:34
char filepath[MODULE_MAX_FILEPATH_SIZE+1]
Module's filepath.
Definition module.h:44
Struct for a Mya token.
Definition token.h:32
long long int value
Integer value of the token on TK_NUMBER tokens.
Definition token.h:36
token_type_t type
Token type.
Definition token.h:33
dstring_t lexeme
Lexeme of the token.
Definition token.h:37
void token_init(token_t *token, const char *lexeme, token_type_t type, unsigned int line, unsigned int column)
Initializes a token struct.
Definition token.c:7
struct dstring dstring_t
A dynamic string (dstring) that automatically reallocates its buffer when needed.
struct module module_t
Struct that represents a Mya module.
struct token token_t
Struct for a Mya token.
@ TK_OPEN_BRACKET
Definition token.h:21
@ TK_NUMBER
Definition token.h:19
@ TK_STRING
Definition token.h:25
@ TK_IDENTIFIER
Definition token.h:17
@ TK_COLON
Definition token.h:13
@ TK_OPEN_BRACES
Definition token.h:20
@ TK_OPEN_PARENS
Definition token.h:22
@ TK_COMMA
Definition token.h:14
@ TK_CLOSE_PARENS
Definition token.h:12
@ TK_KEYWORD
Definition token.h:18
@ TK_OPERATOR
Definition token.h:23
@ TK_CLOSE_BRACKET
Definition token.h:11
@ TK_EOF
Definition token.h:15
@ TK_CLOSE_BRACES
Definition token.h:10
@ TK_EQUAL
Definition token.h:16
@ TK_SEMICOLON
Definition token.h:24