From: Lukas Jiriste <redacted>
Date: Sun, 23 Jun 2024 17:08:37 +0000 (+0200)
Subject: Implement tokenization
X-Git-Url: https://git.ljiriste.work/?a=commitdiff_plain;h=a850c44dba0eceb13319cd42d0981f443d18ce21;p=42%2Fminishell.git

Implement tokenization

Tokenization now appears to work. This makes it possible to test the
parsing, which reveals a memory leak: memory allocated in the
follow_rule function is never freed.
---

diff --git a/inc/minishell.h b/inc/minishell.h
index 100011c..b10c139 100644
--- a/inc/minishell.h
+++ b/inc/minishell.h
@@ -6,7 +6,7 @@
 /*   By: ljiriste <marvin@42.fr>                    +#+  +:+       +#+        */
 /*                                                +#+#+#+#+#+   +#+           */
 /*   Created: 2024/05/02 13:22:57 by ljiriste          #+#    #+#             */
-/*   Updated: 2024/06/21 16:35:59 by ljiriste         ###   ########.fr       */
+/*   Updated: 2024/06/23 18:24:04 by ljiriste         ###   ########.fr       */
 /*                                                                            */
 /* ************************************************************************** */
 
@@ -28,9 +28,9 @@ int		add_var_line(t_vec *vec, const char *line);
 int		add_var(t_vec *vec, const char *key, const char *value);
 void	clean_vars(t_vars *vars);
 
-void	handle_input(const char *line, t_vars *vars);
+void	handle_input(char **line, t_vars *vars);
 
-int		tokenize(const char *line, t_vec *tokens);
+int		tokenize(char **line, t_vec *tokens);
 int		parse(t_vec *tokens, t_tree *parse_tree);
 int		expand(t_tree *parse_tree, t_vars *vars);
 int		execute(t_tree *parse_tree, t_vars *vars);
diff --git a/src/input_handling.c b/src/input_handling.c
index 1c1a763..fa87b5b 100644
--- a/src/input_handling.c
+++ b/src/input_handling.c
@@ -6,7 +6,7 @@
 /*   By: ljiriste <marvin@42.fr>                    +#+  +:+       +#+        */
 /*                                                +#+#+#+#+#+   +#+           */
 /*   Created: 2024/05/03 09:00:00 by ljiriste          #+#    #+#             */
-/*   Updated: 2024/06/21 16:39:27 by ljiriste         ###   ########.fr       */
+/*   Updated: 2024/06/23 18:04:27 by ljiriste         ###   ########.fr       */
 /*                                                                            */
 /* ************************************************************************** */
 
@@ -14,7 +14,7 @@
 #include "libft.h"
 #include <stdlib.h>
 
-void	handle_input(const char *input, t_vars *vars)
+void	handle_input(char **input, t_vars *vars)
 {
 	int		res;
 	t_vec	tokens;
diff --git a/src/main.c b/src/main.c
index 4182246..4d1fc49 100644
--- a/src/main.c
+++ b/src/main.c
@@ -6,7 +6,7 @@
 /*   By: ljiriste <marvin@42.fr>                    +#+  +:+       +#+        */
 /*                                                +#+#+#+#+#+   +#+           */
 /*   Created: 2024/04/26 13:11:47 by ljiriste          #+#    #+#             */
-/*   Updated: 2024/05/03 08:59:30 by ljiriste         ###   ########.fr       */
+/*   Updated: 2024/06/23 18:55:39 by ljiriste         ###   ########.fr       */
 /*                                                                            */
 /* ************************************************************************** */
 
@@ -86,9 +86,9 @@ int	main(int argc, __attribute__((unused)) char **argv, char **envp)
 	while (1)
 	{
 		line = rl_get_line();
-		if (!line || !ft_strcmp(line, "exit"))
+		if (!line || !ft_strncmp(line, "exit", 4))
 			break ;
-		handle_input(line, &vars);
+		handle_input(&line, &vars);
 		free(line);
 	}
 	rl_clear_history();
@@ -117,9 +117,7 @@ char	*get_line(void)
 		ft_printf("\n");
 		return (line);
 	}
-	if (line[ft_strlen(line) - 1] == '\n')
-		line[ft_strlen(line) - 1] = '\0';
-	else
+	if (line[ft_strlen(line) - 1] != '\n')
 		ft_printf("\n");
 	return (line);
 }
@@ -142,9 +140,9 @@ int	main(int argc, __attribute__((unused)) char **argv, char **envp)
 	while (1)
 	{
 		line = get_line();
-		if (!line || !ft_strcmp(line, "exit"))
+		if (!line || !ft_strncmp(line, "exit", 4))
 			break ;
-		handle_input(line, &vars);
+		handle_input(&line, &vars);
 		free(line);
 	}
 	clean_vars(&vars);
diff --git a/src/tokenization.c b/src/tokenization.c
index 6324d31..d92a65a 100644
--- a/src/tokenization.c
+++ b/src/tokenization.c
@@ -1,35 +1,275 @@
 /* ************************************************************************** */
 /*                                                                            */
 /*                                                        :::      ::::::::   */
-/*   tokenize.c                                         :+:      :+:    :+:   */
+/*   tokenization.c                                     :+:      :+:    :+:   */
 /*                                                    +:+ +:+         +:+     */
 /*   By: ljiriste <marvin@42.fr>                    +#+  +:+       +#+        */
 /*                                                +#+#+#+#+#+   +#+           */
 /*   Created: 2024/06/21 16:34:43 by ljiriste          #+#    #+#             */
-/*   Updated: 2024/06/21 16:35:59 by ljiriste         ###   ########.fr       */
+/*   Updated: 2024/06/23 18:54:02 by ljiriste         ###   ########.fr       */
 /*                                                                            */
 /* ************************************************************************** */
 
 #include "minishell.h"
+#include <stdlib.h>
 
-static int	delimit(__attribute__((unused))const char *line, __attribute__((unused))t_vec *tokens)
+#ifndef NOLEAKS
+# include <stdio.h>				// readline
+# include <readline/readline.h>	// readline
+# include <readline/history.h>	// readline
+#else // NOLEAKS
+# include <unistd.h>
+#endif // NOLEAKS
+
+/* 1 if str is NULL or ft_strncmp over size chars matches an operator, else 0 */
+static int	is_operator_start(char *str, size_t size)
 {
-	ft_printf("Function delimit has to be implemented.\n");
-	return (0);
+	static const char *const	ops[] = {
+		"&&", "||", "<", ">", "<<", ">>", "|", "(", ")", NULL};
+	size_t						k;
+
+	if (!str)
+		return (1);
+	k = 0;
+	while (ops[k] && ft_strncmp(str, ops[k], size))
+		++k;
+	return (ops[k] != NULL);
+}
+
+static int	is_operator(t_vec *current_token)
+{
+	char	*str;
+	int		res;
+
+	ft_vec_append(current_token, "");
+	str = current_token->vec;
+	res = (!ft_strcmp(str, "&&")
+			|| !ft_strcmp(str, "||")
+			|| !ft_strcmp(str, "<")
+			|| !ft_strcmp(str, ">")
+			|| !ft_strcmp(str, "<<")
+			|| !ft_strcmp(str, ">>")
+			|| !ft_strcmp(str, "|")
+			|| !ft_strcmp(str, "(")
+			|| !ft_strcmp(str, ")"));
+	ft_vec_erase(current_token, current_token->size - 1, NULL);
+	return (res);
+}
+
+static int	can_expand_operator(t_vec *current_token, char c)
+{
+	int	res;
+
+	ft_vec_append(current_token, &c);
+	res = is_operator_start(current_token->vec, current_token->size);
+	ft_vec_erase(current_token, current_token->size - 1, NULL);
+	return (res);
 }
 
-static int	identify(__attribute__((unused))t_vec *tokens)
+static const char	*g_tokens[] = {
+	"WORD",
+	"ASSIGNMENT_WORD",
+	"IO_NUMBER",
+	"AND_IF",
+	"OR_IF",
+	"LESS",
+	"GREAT",
+	"DLESS",
+	"DGREAT",
+	"PIPE",
+	"LPARA",
+	"RPARA"};
+
+enum token_types
+{
+	WORD,
+	ASSIGNMENT_WORD,
+	IO_NUMBER,
+	AND_IF,
+	OR_IF,
+	LESS,
+	GREAT,
+	DLESS,
+	DGREAT,
+	PIPE,
+	LPARA,
+	RPARA,
+};
+
+/*
+**	Tells whether str is an assignment word, i.e. has the form NAME=VALUE.
+**
+**	NAME must be non-empty, must not begin with a digit and may contain
+**	only alphanumeric characters and '_'. VALUE is not inspected.
+**
+**	A quote or a backslash is never a valid NAME character, so meeting one
+**	before the first '=' already rules the word out. The scan therefore
+**	stops at the first character that cannot belong to a NAME and never
+**	moves past the terminating '\0'. Stepping over the character after a
+**	backslash instead would read out of bounds for a trailing backslash.
+**	An empty NAME ("=value") is rejected as well.
+**
+**	str:	NUL-terminated token text
+**	Returns 1 for an assignment word, 0 otherwise.
+*/
+static int	is_assignment_word(const char *str)
 {
-	ft_printf("Function identify has to be implemented.\n");
+	size_t	name_len;
+
+	name_len = 0;
+	if (ft_isdigit(str[0]))
+		return (0);
+	while (str[name_len] != '=')
+	{
+		if (!ft_isalnum(str[name_len]) && str[name_len] != '_')
+			return (0);
+		++name_len;
+	}
+	if (name_len > 0)
+		return (1);
 	return (0);
 }
 
+int	only_contains_digits(const char *str)
+{
+	while (ft_isdigit(*str))
+		++str;
+	return (*str == '\0');
+}
+
+const char	*get_token_type(const char *str, char next)
+{
+	if (!ft_strcmp(str, "&&"))
+		return (g_tokens[AND_IF]);
+	if (!ft_strcmp(str, "||"))
+		return (g_tokens[OR_IF]);
+	if (!ft_strcmp(str, "<"))
+		return (g_tokens[LESS]);
+	if (!ft_strcmp(str, ">"))
+		return (g_tokens[GREAT]);
+	if (!ft_strcmp(str, "<<"))
+		return (g_tokens[DLESS]);
+	if (!ft_strcmp(str, ">>"))
+		return (g_tokens[DGREAT]);
+	if (!ft_strcmp(str, "|"))
+		return (g_tokens[PIPE]);
+	if (!ft_strcmp(str, "("))
+		return (g_tokens[LPARA]);
+	if (!ft_strcmp(str, ")"))
+		return (g_tokens[RPARA]);
+	if (is_assignment_word(str))
+		return (g_tokens[ASSIGNMENT_WORD]);
+	if (only_contains_digits(str) && (next == '<' || next == '>'))
+		return (g_tokens[IO_NUMBER]);
+	return (g_tokens[WORD]);
+}
+
+#ifndef NOLEAKS
+
+char	*continue_input(char *line, size_t *i)
+{
+			free(line);
+			*i = 0;
+			return (readline("> "));
+}
+
+#else //NOLEAKS
+
+char	*continue_input(char *line, size_t *i)
+{
+	free(line);
+	*i = 0;
+	return (get_next_line(STDIN_FILENO));
+}
+
+#endif //NOLEAKS
+
+//	Consumes the quoted span at line[0][*i] (quote_char; a backslash quotes one
+//	character) and appends its content, without delimiters, to current_token.
+void	handle_quote(t_vec *current_token, char **line, char quote_char, size_t *i)
+{
+	++*i;	// step over the opening delimiter, else the scan never advances
+	if (quote_char == '\\')
+	{
+		if (line[0][*i] == '\n' || line[0][*i] == '\0')	// line continuation
+			*line = continue_input(*line, i);
+		else
+			ft_vec_append(current_token, line[0] + (*i)++);
+		return ;
+	}
+	while (line[0][*i] != quote_char)
+		if (!line[0][*i])	// NOTE(review): a NULL new line is not handled
+			*line = continue_input(*line, i);
+		else
+			ft_vec_append(current_token, line[0] + (*i)++);
+	++*i;	// step over the closing delimiter
+}
+
 //	This function turns the input char string into a string of tokens
-int	tokenize(const char *line, t_vec *tokens)
+//	It should possibly use ft_strdup(ft_vec_access(&current_token, 0)) instead,
+//	as that relies only on t_vec storing its elements contiguously in memory
+int	tokenize(char **line, t_vec *tokens)
 {
-	if (delimit(line, tokens))
-		return (1);
-	if (identify(tokens))
-		return (1);
+	t_vec	current_token;
+	t_token	token;
+	size_t	i;
+
+	ft_vec_init(&current_token, sizeof(char));
+	i = 0;
+	while (line[0][i])
+	{
+		if (is_operator_start(current_token.vec, current_token.size) && can_expand_operator(&current_token, line[0][i]))
+		{
+			ft_vec_append(&current_token, line[0] + i);
+			++i;
+		}
+		else if (is_operator(&current_token))
+		{
+			ft_vec_append(&current_token, "");
+			token.type = (char *)get_token_type(current_token.vec, '\0');
+			token.str = current_token.vec;
+			ft_vec_append(tokens, &token);
+			ft_vec_init(&current_token, sizeof(char));
+		}
+		else if (line[0][i] == '\'')
+			handle_quote(&current_token, line, '\'', &i);
+		else if (line[0][i] == '"')
+			handle_quote(&current_token, line, '"', &i);
+		else if (line[0][i] == '\\')
+			handle_quote(&current_token, line, '\\', &i);
+		else if (is_operator_start(line[0] + i, 1) || ft_isspace(line[0][i]))
+		{
+			if (current_token.size > 0)
+			{
+				ft_vec_append(&current_token, "");
+				token.type = (char *)get_token_type(current_token.vec, line[0][i]);
+				token.str = current_token.vec;
+				ft_vec_append(tokens, &token);
+				ft_vec_init(&current_token, sizeof(char));
+			}
+			if (!ft_isspace(line[0][i]))
+				ft_vec_append(&current_token, line[0] + i);
+			++i;
+		}
+		else if (current_token.size > 0)
+		{
+			ft_vec_append(&current_token, line[0] + i);
+			++i;
+		}
+		else if (line[0][i] == '#')
+			break ;
+		else
+		{
+			ft_vec_append(&current_token, line[0] + i);
+			++i;
+		}
+	}
+	if (current_token.size > 0)
+	{
+		ft_vec_append(&current_token, "");
+		token.type = (char *)get_token_type(current_token.vec, '\0');
+		token.str = current_token.vec;
+		ft_vec_append(tokens, &token);
+	}
 	return (0);
 }