From: Lukas Jiriste Date: Sun, 23 Jun 2024 17:08:37 +0000 (+0200) Subject: Implement tokenization X-Git-Url: https://git.ljiriste.work/?a=commitdiff_plain;h=a850c44dba0eceb13319cd42d0981f443d18ce21;p=42%2Fminishell.git Implement tokenization The tokenization seems to work now. Thanks to it the parsing can be tested which shows a leak with the unfreed memory originating from the follow_rule function. --- diff --git a/inc/minishell.h b/inc/minishell.h index 100011c..b10c139 100644 --- a/inc/minishell.h +++ b/inc/minishell.h @@ -6,7 +6,7 @@ /* By: ljiriste +#+ +:+ +#+ */ /* +#+#+#+#+#+ +#+ */ /* Created: 2024/05/02 13:22:57 by ljiriste #+# #+# */ -/* Updated: 2024/06/21 16:35:59 by ljiriste ### ########.fr */ +/* Updated: 2024/06/23 18:24:04 by ljiriste ### ########.fr */ /* */ /* ************************************************************************** */ @@ -28,9 +28,9 @@ int add_var_line(t_vec *vec, const char *line); int add_var(t_vec *vec, const char *key, const char *value); void clean_vars(t_vars *vars); -void handle_input(const char *line, t_vars *vars); +void handle_input(char **line, t_vars *vars); -int tokenize(const char *line, t_vec *tokens); +int tokenize(char **line, t_vec *tokens); int parse(t_vec *tokens, t_tree *parse_tree); int expand(t_tree *parse_tree, t_vars *vars); int execute(t_tree *parse_tree, t_vars *vars); diff --git a/src/input_handling.c b/src/input_handling.c index 1c1a763..fa87b5b 100644 --- a/src/input_handling.c +++ b/src/input_handling.c @@ -6,7 +6,7 @@ /* By: ljiriste +#+ +:+ +#+ */ /* +#+#+#+#+#+ +#+ */ /* Created: 2024/05/03 09:00:00 by ljiriste #+# #+# */ -/* Updated: 2024/06/21 16:39:27 by ljiriste ### ########.fr */ +/* Updated: 2024/06/23 18:04:27 by ljiriste ### ########.fr */ /* */ /* ************************************************************************** */ @@ -14,7 +14,7 @@ #include "libft.h" #include -void handle_input(const char *input, t_vars *vars) +void handle_input(char **input, t_vars *vars) { int res; t_vec tokens; diff --git a/src/main.c b/src/main.c index 4182246..4d1fc49 100644 --- a/src/main.c +++ b/src/main.c @@ -6,7 +6,7 @@ /* By: ljiriste +#+ +:+ +#+ */ /* +#+#+#+#+#+ +#+ */ /* Created: 2024/04/26 13:11:47 by ljiriste #+# #+# */ -/* Updated: 2024/05/03 08:59:30 by ljiriste ### ########.fr */ +/* Updated: 2024/06/23 18:55:39 by ljiriste ### ########.fr */ /* */ /* ************************************************************************** */ @@ -86,9 +86,9 @@ int main(int argc, __attribute__((unused)) char **argv, char **envp) while (1) { line = rl_get_line(); - if (!line || !ft_strcmp(line, "exit")) + if (!line || !ft_strncmp(line, "exit", 4)) break ; - handle_input(line, &vars); + handle_input(&line, &vars); free(line); } rl_clear_history(); @@ -117,9 +117,7 @@ char *get_line(void) ft_printf("\n"); return (line); } - if (line[ft_strlen(line) - 1] == '\n') - line[ft_strlen(line) - 1] = '\0'; - else + if (line[ft_strlen(line) - 1] != '\n') ft_printf("\n"); return (line); } @@ -142,9 +140,9 @@ int main(int argc, __attribute__((unused)) char **argv, char **envp) while (1) { line = get_line(); - if (!line || !ft_strcmp(line, "exit")) + if (!line || !ft_strncmp(line, "exit", 4)) break ; - handle_input(line, &vars); + handle_input(&line, &vars); free(line); } clean_vars(&vars); diff --git a/src/tokenization.c b/src/tokenization.c index 6324d31..d92a65a 100644 --- a/src/tokenization.c +++ b/src/tokenization.c @@ -1,35 +1,275 @@ /* ************************************************************************** */ /* */ /* ::: :::::::: */ -/* tokenize.c :+: :+: :+: */ +/* tokenization.c :+: :+: :+: */ /* +:+ +:+ +:+ */ /* By: ljiriste +#+ +:+ +#+ */ /* +#+#+#+#+#+ +#+ */ /* Created: 2024/06/21 16:34:43 by ljiriste #+# #+# */ -/* Updated: 2024/06/21 16:35:59 by ljiriste ### ########.fr */ +/* Updated: 2024/06/23 18:54:02 by ljiriste ### ########.fr */ /* */ /* ************************************************************************** */ #include "minishell.h" +#include -static int delimit(__attribute__((unused))const char *line, __attribute__((unused))t_vec *tokens) +#ifndef NOLEAKS +# include // readline +# include // readline +# include // readline +#else // NOLEAKS +# include +#endif // NOLEAKS + +static int is_operator_start(char *str, size_t size) { - ft_printf("Function delimit has to be implemented.\n"); - return (0); + if (!str) + return (1); + return (!ft_strncmp(str, "&&", size) + || !ft_strncmp(str, "||", size) + || !ft_strncmp(str, "<", size) + || !ft_strncmp(str, ">", size) + || !ft_strncmp(str, "<<", size) + || !ft_strncmp(str, ">>", size) + || !ft_strncmp(str, "|", size) + || !ft_strncmp(str, "(", size) + || !ft_strncmp(str, ")", size)); +} + +static int is_operator(t_vec *current_token) +{ + char *str; + int res; + + ft_vec_append(current_token, ""); + str = current_token->vec; + res = (!ft_strcmp(str, "&&") + || !ft_strcmp(str, "||") + || !ft_strcmp(str, "<") + || !ft_strcmp(str, ">") + || !ft_strcmp(str, "<<") + || !ft_strcmp(str, ">>") + || !ft_strcmp(str, "|") + || !ft_strcmp(str, "(") + || !ft_strcmp(str, ")")); + ft_vec_erase(current_token, current_token->size - 1, NULL); + return (res); +} + +static int can_expand_operator(t_vec *current_token, char c) +{ + int res; + + ft_vec_append(current_token, &c); + res = is_operator_start(current_token->vec, current_token->size); + ft_vec_erase(current_token, current_token->size - 1, NULL); + return (res); } -static int identify(__attribute__((unused))t_vec *tokens) +static const char *g_tokens[] = { + "WORD", + "ASSIGNMENT_WORD", + "IO_NUMBER", + "AND_IF", + "OR_IF", + "LESS", + "GREAT", + "DLESS", + "DGREAT", + "PIPE", + "LPARA", + "RPARA"}; + +enum token_types +{ + WORD, + ASSIGNMENT_WORD, + IO_NUMBER, + AND_IF, + OR_IF, + LESS, + GREAT, + DLESS, + DGREAT, + PIPE, + LPARA, + RPARA, +}; + +static int is_assignment_word(const char *str) { - ft_printf("Function identify has to be implemented.\n"); + size_t i; + size_t j; + + i = 0; + while (str[i]) + { + if (str[i] == '"') + while (str[i] && str[i] != '"') + ++i; + else if (str[i] == '\'') + while (str[i] && str[i] != '\'') + ++i; + else if (str[i] == '\\') + ++i; + else if (str[i] == '=') + { + j = 0; + if (ft_isdigit(str[0])) + return (0); + while (j < i) + { + if (!ft_isalnum(str[j]) && str[j] != '_') + return (0); + ++j; + } + return (1); + } + ++i; + } return (0); } +int only_contains_digits(const char *str) +{ + while (ft_isdigit(*str)) + ++str; + return (*str == '\0'); +} + +const char *get_token_type(const char *str, char next) +{ + if (!ft_strcmp(str, "&&")) + return (g_tokens[AND_IF]); + if (!ft_strcmp(str, "||")) + return (g_tokens[OR_IF]); + if (!ft_strcmp(str, "<")) + return (g_tokens[LESS]); + if (!ft_strcmp(str, ">")) + return (g_tokens[GREAT]); + if (!ft_strcmp(str, "<<")) + return (g_tokens[DLESS]); + if (!ft_strcmp(str, ">>")) + return (g_tokens[DGREAT]); + if (!ft_strcmp(str, "|")) + return (g_tokens[PIPE]); + if (!ft_strcmp(str, "(")) + return (g_tokens[LPARA]); + if (!ft_strcmp(str, ")")) + return (g_tokens[RPARA]); + if (is_assignment_word(str)) + return (g_tokens[ASSIGNMENT_WORD]); + if (only_contains_digits(str) && (next == '<' || next == '>')) + return (g_tokens[IO_NUMBER]); + return (g_tokens[WORD]); +} + +#ifndef NOLEAKS + +char *continue_input(char *line, size_t *i) +{ + free(line); + *i = 0; + return (readline("> ")); +} + +#else //NOLEAKS + +char *continue_input(char *line, size_t *i) +{ + free(line); + *i = 0; + return (get_next_line(STDIN_FILENO)); +} + +#endif //NOLEAKS + +void handle_quote(t_vec *current_token, char **line, char quote_char, size_t *i) +{ + if (quote_char == '\\') + { + ++*i; + if (line[0][*i] == '\n') + *line = continue_input(*line, i); + else + ft_vec_append(current_token, line[0] + (*i)++); + return ; + } + while (line[0][*i] != quote_char) + { + if (!line[0][*i]) + *line = continue_input(*line, i); + else + ft_vec_append(current_token, line[0] + (*i)++); + } + return ; +} + // This function turns the input char string into a string of tokens -int tokenize(const char *line, t_vec *tokens) +// It possibly should use ft_strdup(ft_vec_access(¤t_token, 0)) +// as that only relies on the consecutivness of memory of t_vec +int tokenize(char **line, t_vec *tokens) { - if (delimit(line, tokens)) - return (1); - if (identify(tokens)) - return (1); + t_vec current_token; + t_token token; + size_t i; + + ft_vec_init(¤t_token, sizeof(char)); + i = 0; + while (line[0][i]) + { + if (is_operator_start(current_token.vec, current_token.size) && can_expand_operator(¤t_token, line[0][i])) + { + ft_vec_append(¤t_token, line[0] + i); + ++i; + } + else if (is_operator(¤t_token)) + { + ft_vec_append(¤t_token, ""); + token.type = (char *)get_token_type(current_token.vec, '\0'); + token.str = current_token.vec; + ft_vec_append(tokens, &token); + ft_vec_init(¤t_token, sizeof(char)); + } + else if (line[0][i] == '\'') + handle_quote(¤t_token, line, '\'', &i); + else if (line[0][i] == '"') + handle_quote(¤t_token, line, '"', &i); + else if (line[0][i] == '\\') + handle_quote(¤t_token, line, '\\', &i); + else if (is_operator_start(line[0] + i, 1) || ft_isspace(line[0][i])) + { + if (current_token.size > 0) + { + ft_vec_append(¤t_token, ""); + token.type = (char *)get_token_type(current_token.vec, line[0][i]); + token.str = current_token.vec; + ft_vec_append(tokens, &token); + ft_vec_init(¤t_token, sizeof(char)); + } + if (!ft_isspace(line[0][i])) + ft_vec_append(¤t_token, line[0] + i); + ++i; + } + else if (current_token.size > 0) + { + ft_vec_append(¤t_token, line[0] + i); + ++i; + } + else if (line[0][i] == '#') + break ; + else + { + ft_vec_append(¤t_token, line[0] + i); + ++i; + } + } + if (current_token.size > 0) + { + ft_vec_append(¤t_token, ""); + token.type = (char *)get_token_type(current_token.vec, '\0'); + token.str = current_token.vec; + ft_vec_append(tokens, &token); + } return (0); }