logo

utils-std

Collection of commonly available Unix tools
commit: 4bf578acbaab8c77b2c137188fd540d2e55ff573
parent bf534b0ea743b73e32e1d65eb87d67273318d23b
Author: Haelwenn (lanodan) Monnier <contact@hacktivis.me>
Date:   Fri, 23 Aug 2024 22:33:31 +0200

cmd/cut: new

Diffstat:

Acmd/cut.172++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Acmd/cut.c421+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mcoreutils.txt2+-
Mlsb_commands.txt2+-
Mposix_utilities.txt2+-
Mtest-cmd/Kyuafile1+
Atest-cmd/cut.sh32++++++++++++++++++++++++++++++++
7 files changed, 529 insertions(+), 3 deletions(-)

diff --git a/cmd/cut.1 b/cmd/cut.1 @@ -0,0 +1,72 @@ +.\" utils-std: Collection of commonly available Unix tools +.\" Copyright 2017 Haelwenn (lanodan) Monnier <contact+utils@hacktivis.me> +.\" SPDX-License-Identifier: MPL-2.0 +.Dd 2024-08-24 +.Dt CUT 1 +.Os +.Sh NAME +.Nm cut +.Nd cut each line with selected characters/fields +.Sh SYNOPSIS +.Nm +.Fl b Ar list +.Op Fl n +.Op Ar file... +.Nm +.Fl c Ar list +.Op Ar file... +.Nm +.Fl f Ar list +.Op Fl d Ar delim +.Op Fl s +.Op Ar file... +.Sh DESCRIPTION +.Nm +reads lines from each +.Ar file +or, if unspecified, standard input, +and cuts out bytes +.Pq Fl b , +characters +.Pq Fl c , +or character-delimited fields +.Pq Fl f . +.Pp +The +.Ar list +argument is a comma-separated list of 1-based ranges, where for example +.Ql 1,2,3 +and +.Ql 1-3 +are equivalent. +.Sh OPTIONS +.Bl -tag -width _d_delim +.It Fl b Ar list +Cut bytes based on +.Ar list . +.It Fl c Ar list +Cut codepoints based on +.Ar list . +.It Fl d Ar delim +Set the field delimiter to the character +.Ar delim . +(default: \t) +.It Fl f Ar list +Cut fields based on +.Ar list . +.It Fl n +Do not split codepoints. (Currently unsupported in this implementation) +.It Fl s +Suppress lines with no delimiter characters. +.El +.Sh EXIT STATUS +.Ex -std +.Sh STANDARDS +Except for the lack of support for +.Fl n , +.Nm +should be compliant with the +IEEE Std 1003.1-2024 (“POSIX.1”) +specification. 
+.Sh AUTHORS
+.An Haelwenn (lanodan) Monnier Aq Mt contact+utils@hacktivis.me
diff --git a/cmd/cut.c b/cmd/cut.c
@@ -0,0 +1,421 @@
+// utils-std: Collection of commonly available Unix tools
+// SPDX-FileCopyrightText: 2017 Haelwenn (lanodan) Monnier <contact+utils@hacktivis.me>
+// SPDX-License-Identifier: MPL-2.0
+
+#define _POSIX_C_SOURCE 202405L
+
+#include "../lib/reallocarray.h"
+
+#include <errno.h>
+#include <locale.h>
+#include <stdbool.h>
+#include <stdint.h> // size_t
+#include <stdio.h> // fprintf, fopen
+#include <string.h> // strerror
+#include <unistd.h> // getopt
+#include <wchar.h>
+
+#undef MIN
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
+
+enum cut_mode
+{
+	CUT_MODE_NONE = 0,
+	CUT_MODE_B = 1,
+	CUT_MODE_C = 2,
+	CUT_MODE_F = 3,
+};
+
+char delim = '\t';
+bool opt_n = false, opt_s = false;
+enum cut_mode mode = CUT_MODE_NONE;
+bool *list = NULL; // list[i] is true when the 1-based position i+1 is selected
+size_t list_len = 0;
+
+// Parses one number (>= 1) of the list at *s and moves *s past it; returns -1 on error
+static ssize_t
+parse_list_num(char **s)
+{
+	char *endptr = NULL;
+	errno = 0;
+	size_t n = strtoul(*s, &endptr, 10);
+	if(errno != 0)
+	{
+		fprintf(stderr, "cut: Error while parsing '%s' as a number: %s\n", *s, strerror(errno));
+		return -1;
+	}
+
+	if(n < 1)
+	{
+		fprintf(stderr, "cut: Invalid number in list: %zu\n", n);
+		return -1;
+	}
+
+	if(endptr != NULL && strchr(",-", *endptr) == NULL)
+	{
+		fprintf(stderr, "cut: Invalid character in list: %c\n", *endptr);
+		return -1;
+	}
+
+	*s = endptr;
+
+	return n;
+}
+
+// Fills list/list_len from the comma-separated ranges in s; returns 0, or -1 after an error message
+static int
+parse_list(char *s)
+{
+	while(true)
+	{
+		if(s == NULL || *s == '\0') break;
+
+		if(*s == ',')
+		{
+			fprintf(stderr, "cut: Error: empty list element\n");
+			return -1;
+		}
+
+		ssize_t min = parse_list_num(&s);
+		if(min < 0) return -1;
+
+		// min-- as cut(1) is 1-indexed and max needs to be at least min+1
+		ssize_t max = min--;
+
+		if(s && *s == '-')
+		{
+			s++;
+			max = parse_list_num(&s);
+			if(max < 0) return -1;
+
+			if(max <= min) // min is already 0-based here, so N-(N-1) must be caught with <=
+			{
+				fprintf(stderr, "cut: Error: decreasing range: %zd-%zd\n", min + 1, max);
+				return -1;
+			}
+		}
+
+		// Needs to be after *s == '-'
+		if(s && *s == ',') s++;
+
+		if((size_t)max > list_len)
+		{
+			list = reallocarray(list, max, sizeof(*list));
+			if(list == NULL)
+			{
+				fprintf(stderr, "cut: Memory allocation error: %s\n", strerror(errno));
+				return -1;
+			}
+
+			if((size_t)min > list_len)
+			{
+				memset(list + list_len, 0, min - list_len);
+			}
+
+			list_len = max;
+		}
+
+		memset(list + min, 1, max - min);
+	}
+
+	if(list_len == 0)
+	{
+		fprintf(stderr, "cut: Error: empty list\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+// Cuts bytes (-b): writes the selected bytes of each line of in; returns 0, or 1 on read error
+static int
+cut_b(FILE *in, char *filename)
+{
+	char *line = NULL;
+	size_t line_len = 0;
+	int err = 0;
+
+	while(err == 0)
+	{
+		errno = 0;
+		ssize_t nread = getline(&line, &line_len, in);
+
+		if(nread < 0)
+		{
+			if(errno != 0)
+			{
+				fprintf(stderr, "cut: Error while reading file '%s': %s\n", filename, strerror(errno));
+				err = 1;
+			}
+			break;
+		}
+
+		if(nread > 0 && line[nread - 1] == '\n') line[--nread] = '\0'; // re-added below
+
+		for(size_t i = 0; i < MIN(list_len, (size_t)nread); i++)
+			if(list[i]) fputc(line[i], stdout);
+
+		fputc('\n', stdout);
+	}
+
+	if(line_len != 0) free(line);
+
+	return err;
+}
+
+// Cuts characters (-c): decodes each line to wide characters and writes the selected ones
+static int
+cut_c(FILE *in, char *filename)
+{
+	char *line = NULL;
+	size_t line_len = 0;
+	int err = 0;
+	wchar_t *line_w = NULL;
+	ssize_t line_wsz = 0;
+
+	while(err == 0)
+	{
+		errno = 0;
+		ssize_t nread = getline(&line, &line_len, in);
+
+		if(nread < 0)
+		{
+			if(errno != 0)
+			{
+				fprintf(stderr, "cut: Error while reading file '%s': %s\n", filename, strerror(errno));
+				err = 1;
+			}
+			break;
+		}
+
+		// Drop the newline kept by getline(): it must not count as a selectable character
+		if(nread > 0 && line[nread - 1] == '\n') line[--nread] = '\0';
+
+		if(nread > line_wsz)
+		{
+			line_w = reallocarray(line_w, nread, sizeof(*line_w));
+			if(line_w == NULL)
+			{
+				fprintf(stderr, "cut: Memory allocation error: %s\n", strerror(errno));
+				err = 1;
+				break;
+			}
+			line_wsz = nread;
+		}
+
+		size_t wcread = mbstowcs(line_w, line, line_wsz);
+		if(wcread == (size_t)-1)
+		{
+			fprintf(stderr,
+			        "cut: Error while parsing characters in file '%s': %s\n",
+			        filename,
+			        strerror(errno));
+			err = 1;
+			break;
+		}
+
+		//DEBUG fprintf(stderr, "cut: mbstowcs(_, _, %zu) => %zu\n", nread, wcread);
+
+		for(size_t i = 0; i < MIN(list_len, wcread); i++)
+			if(list[i]) fputwc(line_w[i], stdout);
+
+		fputwc(L'\n', stdout); // stdout is wide-oriented here, byte output must not be mixed in
+	}
+
+	if(line_len != 0) free(line);
+	free(line_w);
+
+	return err;
+}
+
+// Cuts fields (-f): writes the selected delim-separated fields of each line, joined by delim
+static int
+cut_f(FILE *in, char *filename)
+{
+	char *line = NULL;
+	size_t line_len = 0;
+	int err = 0;
+
+	while(err == 0)
+	{
+		errno = 0;
+		ssize_t nread = getline(&line, &line_len, in);
+
+		if(nread < 0)
+		{
+			if(errno != 0)
+			{
+				fprintf(stderr, "cut: Error while reading file '%s': %s\n", filename, strerror(errno));
+				err = 1;
+			}
+			break;
+		}
+
+		if(nread > 0 && line[nread - 1] == '\n') line[--nread] = '\0';
+
+		if(strchr(line, delim) == NULL)
+		{
+			if(!opt_s) puts(line); // POSIX: lines without a delimiter pass through intact
+			continue;
+		}
+
+		bool need_sep = false;
+		char *field = line;
+		for(size_t i = 0; field != NULL; i++)
+		{
+			char *next = strchr(field, delim); // split by hand: strtok_r would drop empty fields
+			if(next != NULL) *next++ = '\0';
+			if(i < list_len && list[i])
+			{
+				if(need_sep) fputc(delim, stdout);
+				fputs(field, stdout);
+				need_sep = true;
+			}
+			field = next;
+		}
+
+		fputc('\n', stdout);
+	}
+
+	if(line_len != 0) free(line);
+
+	return err;
+}
+
+// Runs the cutter matching the global mode on in; filename is only used in error messages
+static int
+cut(FILE *in, char *filename)
+{
+	switch(mode)
+	{
+	case CUT_MODE_NONE:
+		fprintf(stderr, "cut: No action (-b, -c, -f) specified\n");
+		return 1;
+	case CUT_MODE_B:
+		return cut_b(in, filename);
+	case CUT_MODE_C:
+		return cut_c(in, filename);
+	case CUT_MODE_F:
+		return cut_f(in, filename);
+	default:
+		abort();
+	}
+}
+
+int
+main(int argc, char *argv[])
+{
+	char *opt_list = NULL;
+
+	errno = 0;
+	setlocale(LC_ALL, "");
+	if(errno != 0)
+	{
+		fprintf(stderr, "cut: Warning: Failed to initialize locales: %s\n", strerror(errno));
+		errno = 0;
+	}
+
+	int c = -1;
+	while((c = getopt(argc, argv, ":b:c:d:f:ns")) != -1)
+	{
+		switch(c)
+		{
+		case 'b':
+			if(opt_list != NULL)
+			{
+				fprintf(stderr, "cut: Only one list may be specified\n");
+				return 1;
+			}
+			mode = CUT_MODE_B;
+			opt_list = optarg;
+			break;
+		case 'c':
+			if(opt_list != NULL)
+			{
+				fprintf(stderr, "cut: Only one list may be specified\n");
+				return 1;
+			}
+			mode = CUT_MODE_C;
+			opt_list = optarg;
+			break;
+		case 'f':
+			if(opt_list != NULL)
+			{
+				fprintf(stderr, "cut: Only one list may be specified\n");
+				return 1;
+			}
+			mode = CUT_MODE_F;
+			opt_list = optarg;
+			break;
+		case 'd':
+			if(optarg[0] == '\0')
+			{
+				fprintf(stderr, "cut: Option '-d' requires a character, got an empty string\n");
+				return 1;
+			}
+			if(optarg[1] != '\0')
+			{
+				fprintf(stderr, "cut: Option '-d' only accepts single characters, got \"%s\"\n", optarg);
+				return 1;
+			}
+			delim = optarg[0];
+			break;
+		case 'n':
+			opt_n = true;
+			break;
+		case 's':
+			opt_s = true;
+			break;
+		case ':':
+			fprintf(stderr, "cut: Option '-%c' requires an operand\n", optopt);
+			return 1;
+		default:
+			fprintf(stderr, "cut: Unhandled option '-%c'\n", optopt);
+			return 1;
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if(mode == CUT_MODE_NONE)
+	{
+		fprintf(stderr, "cut: No action (-b, -c, -f) specified\n");
+		return 1;
+	}
+
+	if(parse_list(opt_list) < 0) return 1;
+
+#if 0
+	fprintf(stderr, "[DEBUG] list: ");
+	for(size_t i = 0; i < list_len; i++)
+	{
+		fputc(list[i] ? '1' : '0', stderr);
+	}
+	fputc('\n', stderr);
+#endif
+
+	if(argc <= 0) return cut(stdin, "<stdin>");
+
+	for(int i = 0; i < argc; i++)
+	{
+		FILE *in = fopen(argv[i], "r");
+		if(in == NULL)
+		{
+			fprintf(stderr, "cut: Failed opening file '%s': %s\n", argv[i], strerror(errno));
+			return 1;
+		}
+
+		int ret = cut(in, argv[i]);
+
+		if(fclose(in) < 0)
+		{
+			fprintf(stderr, "cut: Failed closing file '%s': %s\n", argv[i], strerror(errno));
+			return 1;
+		}
+
+		if(ret != 0) return 1;
+	}
+
+	return 0;
+}
diff --git a/coreutils.txt b/coreutils.txt
@@ -17,7 +17,7 @@ cksum: Done
 comm: Maybe, POSIX
 cp: Todo
 csplit: No. Considered obsolete
-cut: Todo
+cut: Done
 date: Done
 dd: No. non-Unix
 df: Done
diff --git a/lsb_commands.txt b/lsb_commands.txt
@@ -21,7 +21,7 @@ cp: Todo
 cpio: out of scope
 crontab: out of scope
 csplit: No. 
Considered obsolete -cut: Todo +cut: Done date: Done dd: No. non-Unix df: Done diff --git a/posix_utilities.txt b/posix_utilities.txt @@ -30,7 +30,7 @@ cp crontab: no, external csplit ctags: no, toolchain XOPEN_UNIX -cut +cut: done cxref: no, toolchain XOPEN_UNIX date: done dd diff --git a/test-cmd/Kyuafile b/test-cmd/Kyuafile @@ -17,6 +17,7 @@ tap_test_program{name="cat.sh", required_files=basedir.."/cmd/cat", timeout=1} tap_test_program{name="chmod.sh", required_files=basedir.."/cmd/chmod", timeout=1} tap_test_program{name="cksum.sh", required_files=basedir.."/cmd/cksum", timeout=1} tap_test_program{name="cmp.sh", required_files=basedir.."/cmd/cmp", timeout=1} +tap_test_program{name="cut.sh", required_files=basedir.."/cmd/cut", timeout=1} tap_test_program{name="date.sh", required_files=basedir.."/cmd/date", timeout=1} tap_test_program{name="dirname.sh", required_files=basedir.."/cmd/dirname", timeout=1} tap_test_program{name="echo.sh", required_files=basedir.."/cmd/echo", timeout=1} diff --git a/test-cmd/cut.sh b/test-cmd/cut.sh @@ -0,0 +1,32 @@ +#!/bin/sh +# SPDX-FileCopyrightText: 2017 Haelwenn (lanodan) Monnier <contact+utils@hacktivis.me> +# SPDX-License-Identifier: MPL-2.0 + +WD="$(dirname "$0")/../" +target="${WD}/cmd/cut" +plans=6 +. "${WD}/test-cmd/tap.sh" + +t 'bytes:2-3,10-20,4,12' "-b 2-3,10-20,4,12 ${WD}/test-cmd/inputs/alnum" '1239ABCDEFGHIJ +' +t --exit=1 'bytes:,' "-b , ${WD}/test-cmd/inputs/alnum" 'cut: Error: empty list element +' +# Example taken from POSIX cut(1) +t --input='abcdefghi' 'chars:6,2,4-7,1' '-c 6,2,4-7,1' 'abdefg +' + +t --input='aéb' 'widechar' '-c2' 'é +' + +fields='1 2 3 4 + +a b c' + +t --input="$fields" f2 '-f2' '2 + +b +' + +t --input="$fields" s_f2 '-s -f2' '2 +b +'