logo

utils-std

Collection of commonly available Unix tools.
git clone https://anongit.hacktivis.me/git/utils-std.git/
commit: 8aec98a517fdbbb0fc303e64ad5c7f61babdfd1b
parent 7c50cbab04faffa818b127dd3b55047dc01cfc4f
Author: Haelwenn (lanodan) Monnier <contact@hacktivis.me>
Date:   Fri, 16 Jan 2026 23:50:47 +0100

cmd/cut: implement -n option

Fixes: https://todo.sr.ht/~lanodan/utils-std/4

Diffstat:

M cmd/cut.1       |  5 ++---
M cmd/cut.c       | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
M test-cmd/cut.sh | 11 ++++++++++-
3 files changed, 71 insertions(+), 5 deletions(-)

diff --git a/cmd/cut.1 b/cmd/cut.1
@@ -74,7 +74,8 @@ if an empty string is passed, then NULL is used as separator.
 Cut fields based on
 .Ar list .
 .It Fl n
-Do not split codepoints. (Currently unsupported in this implementation)
+Prevent splitting codepoints.
+If the last byte of a codepoint is part of the selection, it gets printed, otherwise it gets dropped.
 .It Fl s
 Suppress lines with no delimiter characters,
 otherwise whole delimiter-less lines are printed as-is.
@@ -84,8 +85,6 @@ Use NULL as line separator instead of newline.
 .Sh EXIT STATUS
 .Ex -std
 .Sh STANDARDS
-Except for the lack of support for
-.Fl n ,
 .Nm
 should be compliant with the
 IEEE Std 1003.1-2024 (“POSIX.1”)
diff --git a/cmd/cut.c b/cmd/cut.c
@@ -193,6 +193,64 @@ cut_b(FILE *in, const char *filename)
 }
 
 static int
+cut_b_n(FILE *in, const char *filename)
+{
+	char *line = NULL;
+	size_t line_len = 0;
+	int err = 0;
+
+	while(err == 0)
+	{
+		errno = 0;
+		ssize_t sread = getdelim(&line, &line_len, line_delim, in);
+		if(sread < 0)
+		{
+			if(errno != 0)
+			{
+				fprintf(
+					stderr, "%s: error: Failed reading file '%s': %s\n", argv0, filename, strerror(errno));
+				err = 1;
+			}
+			break;
+		}
+		size_t nread = (size_t)sread;
+
+		if(nread == 0)
+		{
+			fputc(line_delim, stdout);
+			continue;
+		}
+
+		if(line[nread - 1] == '\n') line[nread--] = '\0';
+
+		for(size_t i = 0; i < MIN(list_len, nread); i++)
+		{
+			size_t isz = mbrlen(line + i, nread, NULL);
+
+			if(isz == 0 || isz == (size_t)-2 || isz == (size_t)-1) continue;
+
+			/*
+			 * Check that last byte is part of the low-high selection.
+			 * Per POSIX.1-2024 high and low only decrements, and otherwise drops the character.
+			 */
+			if(list[i + isz - 1])
+			{
+				fwrite(line + i, isz, 1, stdout);
+				if(isz > 1) i += isz - 1;
+			}
+		}
+
+		if(nostop && nread > list_len) fwrite(line + list_len, nread - list_len, 1, stdout);
+
+		fputc(line_delim, stdout);
+	}
+
+	free(line);
+
+	return err;
+}
+
+static int
 cut_c(FILE *in, const char *filename)
 {
 	char *line = NULL;
@@ -360,7 +418,7 @@ cut(FILE *in, const char *filename)
 		fprintf(stderr, "%s: error: No action (-b, -c, -f) specified\n", argv0);
 		return 1;
 	case CUT_MODE_B:
-		return cut_b(in, filename);
+		return opt_n ? cut_b_n(in, filename) : cut_b(in, filename);
 	case CUT_MODE_C:
 		return cut_c(in, filename);
 	case CUT_MODE_F:
diff --git a/test-cmd/cut.sh b/test-cmd/cut.sh
@@ -2,9 +2,9 @@
 # SPDX-FileCopyrightText: 2017 Haelwenn (lanodan) Monnier <contact+utils@hacktivis.me>
 # SPDX-License-Identifier: MPL-2.0
 
+plans=18
 WD="$(dirname "$0")/../"
 target="${WD}/cmd/cut"
-plans=15
 . "${WD}/test-cmd/tap.sh"
 . "${WD}/test-cmd/init_env.sh"
 
@@ -33,6 +33,15 @@ t 'bytes:-10' "-b -10 ${WD}/test-cmd/inputs/alnum" '0123456789
 '
 t --exit=1 'bytes:,' "-b , ${WD}/test-cmd/inputs/alnum" 'cut: error: Empty list element
 '
+# The cafés tests assumes a UTF-8 environment
+t --input='cafés' 'bytes:cafés[4]' '-b 4' '
+'
+# per POSIX high only decrements so here it gets dropped
+t --input='cafés' 'bytes:cafés[4]' '-b 4 -n' '
+'
+t --input='cafés' 'bytes:cafés[5]' '-b 5 -n' 'é
+'
+
 # Example taken from POSIX cut(1)
 t --input='abcdefghi' 'chars:6,2,4-7,1' '-c 6,2,4-7,1' 'abdefg
 '