Implement tokenization
authorLukas Jiriste <ljiriste@student.42prague.com>
Sun, 23 Jun 2024 17:08:37 +0000 (19:08 +0200)
committerLukas Jiriste <ljiriste@student.42prague.com>
Sun, 23 Jun 2024 17:08:37 +0000 (19:08 +0200)
The tokenization seems to work now. Thanks to it, the parsing can be
tested, which reveals a memory leak: memory allocated in the
follow_rule function is never freed.

inc/minishell.h
src/input_handling.c
src/main.c
src/tokenization.c

index 100011ca68e5754e3d19cba6efe75340f575f66d..b10c13935e546cc715fe7123d0c287c640badae2 100644 (file)
@@ -6,7 +6,7 @@
 /*   By: ljiriste <marvin@42.fr>                    +#+  +:+       +#+        */
 /*                                                +#+#+#+#+#+   +#+           */
 /*   Created: 2024/05/02 13:22:57 by ljiriste          #+#    #+#             */
-/*   Updated: 2024/06/21 16:35:59 by ljiriste         ###   ########.fr       */
+/*   Updated: 2024/06/23 18:24:04 by ljiriste         ###   ########.fr       */
 /*                                                                            */
 /* ************************************************************************** */
 
@@ -28,9 +28,9 @@ int           add_var_line(t_vec *vec, const char *line);
 int            add_var(t_vec *vec, const char *key, const char *value);
 void   clean_vars(t_vars *vars);
 
-void   handle_input(const char *line, t_vars *vars);
+void   handle_input(char **line, t_vars *vars);
 
-int            tokenize(const char *line, t_vec *tokens);
+int            tokenize(char **line, t_vec *tokens);
 int            parse(t_vec *tokens, t_tree *parse_tree);
 int            expand(t_tree *parse_tree, t_vars *vars);
 int            execute(t_tree *parse_tree, t_vars *vars);
index 1c1a763320aa7e58fc97ef31a4756b7b8507cea4..fa87b5b0882d790ff0e57992ecdde909d221bf8c 100644 (file)
@@ -6,7 +6,7 @@
 /*   By: ljiriste <marvin@42.fr>                    +#+  +:+       +#+        */
 /*                                                +#+#+#+#+#+   +#+           */
 /*   Created: 2024/05/03 09:00:00 by ljiriste          #+#    #+#             */
-/*   Updated: 2024/06/21 16:39:27 by ljiriste         ###   ########.fr       */
+/*   Updated: 2024/06/23 18:04:27 by ljiriste         ###   ########.fr       */
 /*                                                                            */
 /* ************************************************************************** */
 
@@ -14,7 +14,7 @@
 #include "libft.h"
 #include <stdlib.h>
 
-void   handle_input(const char *input, t_vars *vars)
+void   handle_input(char **input, t_vars *vars)
 {
        int             res;
        t_vec   tokens;
index 4182246709be3c97c22fead54601b13ac516af7e..4d1fc496ada29314873c0f1bec97533db433c0c5 100644 (file)
@@ -6,7 +6,7 @@
 /*   By: ljiriste <marvin@42.fr>                    +#+  +:+       +#+        */
 /*                                                +#+#+#+#+#+   +#+           */
 /*   Created: 2024/04/26 13:11:47 by ljiriste          #+#    #+#             */
-/*   Updated: 2024/05/03 08:59:30 by ljiriste         ###   ########.fr       */
+/*   Updated: 2024/06/23 18:55:39 by ljiriste         ###   ########.fr       */
 /*                                                                            */
 /* ************************************************************************** */
 
@@ -86,9 +86,9 @@ int   main(int argc, __attribute__((unused)) char **argv, char **envp)
        while (1)
        {
                line = rl_get_line();
-               if (!line || !ft_strcmp(line, "exit"))
+               if (!line || !ft_strncmp(line, "exit", 4))
                        break ;
-               handle_input(line, &vars);
+               handle_input(&line, &vars);
                free(line);
        }
        rl_clear_history();
@@ -117,9 +117,7 @@ char        *get_line(void)
                ft_printf("\n");
                return (line);
        }
-       if (line[ft_strlen(line) - 1] == '\n')
-               line[ft_strlen(line) - 1] = '\0';
-       else
+       if (line[ft_strlen(line) - 1] != '\n')
                ft_printf("\n");
        return (line);
 }
@@ -142,9 +140,9 @@ int main(int argc, __attribute__((unused)) char **argv, char **envp)
        while (1)
        {
                line = get_line();
-               if (!line || !ft_strcmp(line, "exit"))
+               if (!line || !ft_strncmp(line, "exit", 4))
                        break ;
-               handle_input(line, &vars);
+               handle_input(&line, &vars);
                free(line);
        }
        clean_vars(&vars);
index 6324d31ed5626cf513dbbcedcfd9117cc169ba8f..d92a65aff697578cd24ef9db762ca2abe54e309d 100644 (file)
 /* ************************************************************************** */
 /*                                                                            */
 /*                                                        :::      ::::::::   */
-/*   tokenize.c                                         :+:      :+:    :+:   */
+/*   tokenization.c                                     :+:      :+:    :+:   */
 /*                                                    +:+ +:+         +:+     */
 /*   By: ljiriste <marvin@42.fr>                    +#+  +:+       +#+        */
 /*                                                +#+#+#+#+#+   +#+           */
 /*   Created: 2024/06/21 16:34:43 by ljiriste          #+#    #+#             */
-/*   Updated: 2024/06/21 16:35:59 by ljiriste         ###   ########.fr       */
+/*   Updated: 2024/06/23 18:54:02 by ljiriste         ###   ########.fr       */
 /*                                                                            */
 /* ************************************************************************** */
 
 #include "minishell.h"
+#include <stdlib.h>
 
-static int     delimit(__attribute__((unused))const char *line, __attribute__((unused))t_vec *tokens)
+#ifndef NOLEAKS
+# include <stdio.h>                            // readline
+# include <readline/readline.h>        // readline
+# include <readline/history.h> // readline
+#else // NOLEAKS
+# include <unistd.h>
+#endif // NOLEAKS
+
+//     Tells whether the first size characters of str could begin one of the
+//     shell operators (&&, ||, <, >, <<, >>, |, ( and )), as judged by
+//     ft_strncmp. A NULL str counts as a possible operator start.
+static int     is_operator_start(char *str, size_t size)
 {
-       ft_printf("Function delimit has to be implemented.\n");
-       return (0);
+       static const char *const        operators[] = {
+               "&&", "||", "<", ">", "<<", ">>", "|", "(", ")"};
+       size_t                          idx;
+
+       if (str == NULL)
+               return (1);
+       idx = 0;
+       while (idx < sizeof(operators) / sizeof(operators[0]))
+       {
+               if (ft_strncmp(str, operators[idx], size) == 0)
+                       return (1);
+               ++idx;
+       }
+       return (0);
+}
+
+//     Reports whether current_token holds exactly one of the shell operators.
+//     An empty-string element is appended so the buffer can be compared as a
+//     C string; that last element is erased again before returning.
+static int     is_operator(t_vec *current_token)
+{
+       static const char *const        operators[] = {
+               "&&", "||", "<", ">", "<<", ">>", "|", "(", ")"};
+       char                            *text;
+       size_t                          idx;
+       int                             found;
+
+       ft_vec_append(current_token, "");
+       text = current_token->vec;
+       found = 0;
+       idx = 0;
+       while (!found && idx < sizeof(operators) / sizeof(operators[0]))
+       {
+               found = (ft_strcmp(text, operators[idx]) == 0);
+               ++idx;
+       }
+       ft_vec_erase(current_token, current_token->size - 1, NULL);
+       return (found);
+}
+
+//     Checks whether current_token followed by the character c would still be
+//     the beginning of an operator (see is_operator_start). The character is
+//     appended only for the duration of the check and erased afterwards.
+static int     can_expand_operator(t_vec *current_token, char c)
+{
+       int     extends;
+
+       ft_vec_append(current_token, &c);
+       extends = is_operator_start(current_token->vec, current_token->size);
+       ft_vec_erase(current_token, current_token->size - 1, NULL);
+       return (extends);
 }
 
-static int     identify(__attribute__((unused))t_vec *tokens)
+//     Token categories recognised by the tokenizer; the values index g_tokens.
+enum token_types
+{
+       WORD,
+       ASSIGNMENT_WORD,
+       IO_NUMBER,
+       AND_IF,
+       OR_IF,
+       LESS,
+       GREAT,
+       DLESS,
+       DGREAT,
+       PIPE,
+       LPARA,
+       RPARA,
+};
+
+//     Printable name of every token category, indexed by enum token_types.
+static const char      *g_tokens[] = {
+[WORD] = "WORD",
+[ASSIGNMENT_WORD] = "ASSIGNMENT_WORD",
+[IO_NUMBER] = "IO_NUMBER",
+[AND_IF] = "AND_IF",
+[OR_IF] = "OR_IF",
+[LESS] = "LESS",
+[GREAT] = "GREAT",
+[DLESS] = "DLESS",
+[DGREAT] = "DGREAT",
+[PIPE] = "PIPE",
+[LPARA] = "LPARA",
+[RPARA] = "RPARA"};
+
+//     Returns 1 when str has the form NAME=..., 0 otherwise.
+//     NAME must be non-empty, must not start with a digit and may only
+//     consist of characters accepted by ft_isalnum or '_'.
+//     A quote or backslash in front of the '=' can never be part of a valid
+//     NAME, so scanning the leading name characters is sufficient: the scan
+//     stops at the first character that is not a name character and the
+//     token is an assignment only if that character is '='.
+//     This also rejects a token starting with '=' and never reads past the
+//     terminating '\0'.
+static int     is_assignment_word(const char *str)
 {
-       ft_printf("Function identify has to be implemented.\n");
+       size_t  i;
+
+       if (ft_isdigit(str[0]))
+               return (0);
+       i = 0;
+       while (ft_isalnum(str[i]) || str[i] == '_')
+               ++i;
+       if (i > 0 && str[i] == '=')
+               return (1);
        return (0);
 }
 
+//     Returns 1 if every character of str is accepted by ft_isdigit and
+//     0 otherwise. An empty string also yields 1.
+int    only_contains_digits(const char *str)
+{
+       size_t  pos;
+
+       pos = 0;
+       while (ft_isdigit(str[pos]))
+               ++pos;
+       if (str[pos] != '\0')
+               return (0);
+       return (1);
+}
+
+//     Maps a finished token string to the name of its token category.
+//     Operators are matched exactly. Any other token is an ASSIGNMENT_WORD
+//     if is_assignment_word accepts it, an IO_NUMBER if it consists of digits
+//     only and next (the character following the token) is '<' or '>',
+//     and a WORD otherwise. The returned pointer is an entry of g_tokens.
+const char     *get_token_type(const char *str, char next)
+{
+       static const char *const        operators[] = {
+       [AND_IF] = "&&", [OR_IF] = "||", [LESS] = "<", [GREAT] = ">",
+       [DLESS] = "<<", [DGREAT] = ">>", [PIPE] = "|", [LPARA] = "(",
+       [RPARA] = ")"};
+       int                             type;
+
+       type = AND_IF;
+       while (type <= RPARA)
+       {
+               if (ft_strcmp(str, operators[type]) == 0)
+                       return (g_tokens[type]);
+               ++type;
+       }
+       if (is_assignment_word(str))
+               return (g_tokens[ASSIGNMENT_WORD]);
+       if ((next == '<' || next == '>') && only_contains_digits(str))
+               return (g_tokens[IO_NUMBER]);
+       return (g_tokens[WORD]);
+}
+
+#ifndef NOLEAKS
+
+//     Discards the current line and fetches the next one with readline,
+//     showing the "> " continuation prompt. The index *i is reset to 0 so
+//     the caller continues at the start of the new line.
+//     The value returned by readline is passed on unchanged.
+char   *continue_input(char *line, size_t *i)
+{
+       char    *next_line;
+
+       *i = 0;
+       free(line);
+       next_line = readline("> ");
+       return (next_line);
+}
+
+#else //NOLEAKS
+
+//     Discards the current line and fetches the next one from standard input
+//     with get_next_line. The index *i is reset to 0 so the caller continues
+//     at the start of the new line.
+//     The value returned by get_next_line is passed on unchanged.
+char   *continue_input(char *line, size_t *i)
+{
+       char    *next_line;
+
+       *i = 0;
+       free(line);
+       next_line = get_next_line(STDIN_FILENO);
+       return (next_line);
+}
+
+#endif //NOLEAKS
+
+//     Consumes one quoted section of the input starting at line[0][*i], which
+//     must be the quoting character itself (quote_char: ', " or \).
+//     The quoting character is stepped over first; without that step the
+//     scan below would stop immediately on the opening quote and the caller
+//     would never advance.
+//     Backslash: the following character is appended to current_token as is.
+//     If the backslash is the last character of the line (followed by '\n'
+//     or by the terminator), the line is replaced through continue_input.
+//     Quotes: every character up to the matching quote is appended and the
+//     closing quote is stepped over. When the line ends first, it is replaced
+//     through continue_input and scanning goes on in the new line.
+//     If continue_input returns NULL, *line is left NULL and the function
+//     returns without touching it; the caller has to check for that.
+//     NOTE(review): the quoting characters themselves are not stored in the
+//     token, so later stages cannot tell quoted text from unquoted text;
+//     confirm that this is intended.
+void   handle_quote(t_vec *current_token, char **line, char quote_char, size_t *i)
+{
+       ++*i;
+       if (quote_char == '\\')
+       {
+               if (line[0][*i] == '\n' || line[0][*i] == '\0')
+                       *line = continue_input(*line, i);
+               else
+                       ft_vec_append(current_token, line[0] + (*i)++);
+               return ;
+       }
+       while (*line && line[0][*i] != quote_char)
+       {
+               if (line[0][*i] == '\0')
+                       *line = continue_input(*line, i);
+               else
+                       ft_vec_append(current_token, line[0] + (*i)++);
+       }
+       if (*line)
+               ++*i;
+}
+
 //     This function turns the input char string into a string of tokens
-int    tokenize(const char *line, t_vec *tokens)
+//     It possibly should use ft_strdup(ft_vec_access(&current_token, 0))
+//     as that only relies on the consecutivness of memory of t_vec
+int    tokenize(char **line, t_vec *tokens)
 {
-       if (delimit(line, tokens))
-               return (1);
-       if (identify(tokens))
-               return (1);
+       t_vec   current_token;
+       t_token token;
+       size_t  i;
+
+       ft_vec_init(&current_token, sizeof(char));
+       i = 0;
+       while (line[0][i])
+       {
+               if (is_operator_start(current_token.vec, current_token.size) && can_expand_operator(&current_token, line[0][i]))
+               {
+                       ft_vec_append(&current_token, line[0] + i);
+                       ++i;
+               }
+               else if (is_operator(&current_token))
+               {
+                       ft_vec_append(&current_token, "");
+                       token.type = (char *)get_token_type(current_token.vec, '\0');
+                       token.str = current_token.vec;
+                       ft_vec_append(tokens, &token);
+                       ft_vec_init(&current_token, sizeof(char));
+               }
+               else if (line[0][i] == '\'')
+                       handle_quote(&current_token, line, '\'', &i);
+               else if (line[0][i] == '"')
+                       handle_quote(&current_token, line, '"', &i);
+               else if (line[0][i] == '\\')
+                       handle_quote(&current_token, line, '\\', &i);
+               else if (is_operator_start(line[0] + i, 1) || ft_isspace(line[0][i]))
+               {
+                       if (current_token.size > 0)
+                       {
+                               ft_vec_append(&current_token, "");
+                               token.type = (char *)get_token_type(current_token.vec, line[0][i]);
+                               token.str = current_token.vec;
+                               ft_vec_append(tokens, &token);
+                               ft_vec_init(&current_token, sizeof(char));
+                       }
+                       if (!ft_isspace(line[0][i]))
+                               ft_vec_append(&current_token, line[0] + i);
+                       ++i;
+               }
+               else if (current_token.size > 0)
+               {
+                       ft_vec_append(&current_token, line[0] + i);
+                       ++i;
+               }
+               else if (line[0][i] == '#')
+                       break ;
+               else
+               {
+                       ft_vec_append(&current_token, line[0] + i);
+                       ++i;
+               }
+       }
+       if (current_token.size > 0)
+       {
+               ft_vec_append(&current_token, "");
+               token.type = (char *)get_token_type(current_token.vec, '\0');
+               token.str = current_token.vec;
+               ft_vec_append(tokens, &token);
+       }
        return (0);
 }