logo

utils-std

Collection of commonly available Unix tools
commit: 6640362d9df83900ffc23d7d966daaa611ad9fa5
parent 6d4cb0a82ed964934626d1364dc6445641fb4834
Author: Haelwenn (lanodan) Monnier <contact@hacktivis.me>
Date:   Wed, 24 Apr 2024 13:15:41 +0200

cmd/wc: new

Diffstat:

A cmd/wc.1 | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A cmd/wc.c | 255 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M makeless.sh | 1 +
A test-cmd/wc.t | 42 ++++++++++++++++++++++++++++++++++++++++++
4 files changed, 371 insertions(+), 0 deletions(-)

diff --git a/cmd/wc.1 b/cmd/wc.1 @@ -0,0 +1,73 @@ +.\" utils-std: Collection of commonly available Unix tools +.\" Copyright 2017 Haelwenn (lanodan) Monnier <contact+utils@hacktivis.me> +.\" SPDX-License-Identifier: MPL-2.0 +.Dd 2024-04-24 +.Dt WC 1 +.Os +.Sh NAME +.Nm wc +.Nd Count lines, words, bytes/characters +.Sh SYNOPSIS +.Nm +.Op Fl c Ns | Ns Fl m +.Op Fl lw +.Op Ar file... +.Sh DESCRIPTION +.Nm +reads each given +.Ar file +and by default report their numbers of newlines, words and bytes. +If no +.Ar file +is given, then +.Nm +reads from standard input. +.Pp +A word is defined as a non-empty string delimited by whitespace, +some other implementation choose to additionally exclude +non-printable characters. +.Sh OPTIONS +.Bl -tag -width Ds +.It Fl c +Explicitly use single-byte mode, and write the number of bytes in each +.Ar file . +.It Fl l +Write the number of newlines in each +.Ar file . +.It Fl m +Switch to multibyte-characters mode, and write the number of characters in each +.Ar file . +The particular encoding is dependent on your locale. +.It Fl w +Write the number of words in each +.Ar file . +.El +.Pp +If any option is specified, +.Nm +reports only the requested information, without their ordering +changing output formatting. +The default is equivalent to +.Cm wc +.Fl clw . +.Sh ENVIRONMENT VARIABLES +See +.Xr locale 1 . +.Sh STDOUT +By default the standard output reports each file in the form: +.Bd -literal +"%d %d %d %s", <newlines>, <words>, <bytes>, <file> +.Ed +.Pp +If more than one +.Ar file +is given, a final line is printed with "total" instead of a pathname. +.Sh EXIT STATUS +.Ex -std +.Sh STANDARDS +.Nm +should be compliant with the +.St -p1003.1-2008 +specification. 
+.Sh AUTHORS +.An Haelwenn (lanodan) Monnier Aq Mt contact+utils@hacktivis.me diff --git a/cmd/wc.c b/cmd/wc.c @@ -0,0 +1,255 @@ +// utils-std: Collection of commonly available Unix tools +// SPDX-FileCopyrightText: 2017 Haelwenn (lanodan) Monnier <contact+utils@hacktivis.me> +// SPDX-License-Identifier: MPL-2.0 + +#define _POSIX_C_SOURCE 200809L +#include "../lib/bitmasks.h" + +#include <assert.h> +#include <ctype.h> // isspace +#include <errno.h> +#include <locale.h> // setlocale +#include <stdbool.h> +#include <stdint.h> // uint8_t +#include <stdio.h> // fprintf, fopen +#include <stdlib.h> // abort +#include <string.h> // strchr, strerror +#include <sys/stat.h> +#include <unistd.h> // getopt +#include <wchar.h> +#include <wctype.h> // iswspace + +static char *argv0 = "wc"; + +static enum { + WC_OPT_C = 1 << 0, + WC_OPT_L = 1 << 1, + WC_OPT_W = 1 << 3, + WC_OPT_ALL = WC_OPT_C | WC_OPT_L | WC_OPT_W, +} wc_opts = 0; + +off_t total_bytes = 0, total_lines = 0, total_words = 0; + +static int +wc_file_bytes(FILE *file, char *filename) +{ + off_t bytes = 0, lines = 0, words = 0, wordlen = 0; + + while(true) + { + assert(errno == 0); + int c = getc(file); + if(c == EOF) + { + if(errno != 0) + { + fprintf(stderr, + "%s: Failed reading from file '%s': %s\n", + argv0, + filename != NULL ? 
filename : "<stdin>", + strerror(errno)); + return -1; + } + break; + } + bytes++; + + if(c == '\n') lines++; + + if(isspace(c)) + { + if(wordlen > 0) + { + words++; + wordlen = 0; + } + } + else + { + wordlen++; + } + } + + if(wordlen > 0) words++; + + total_bytes += bytes, total_lines += lines, total_words += words; + + if(FIELD_MATCH(wc_opts, WC_OPT_L)) printf("%ld ", lines); + if(FIELD_MATCH(wc_opts, WC_OPT_W)) printf("%ld ", words); + if(FIELD_MATCH(wc_opts, WC_OPT_C)) printf("%ld ", bytes); + if(filename != NULL) printf("%s", filename); + printf("\n"); + + return 0; +} + +static int +wc_file_chars(FILE *file, char *filename) +{ + off_t chars = 0, lines = 0, words = 0, wordlen = 0; + + while(true) + { + assert(errno == 0); + wint_t c = getwc(file); + if(c == WEOF) + { + if(errno != 0) + { + fprintf(stderr, + "%s: Failed reading from file '%s': %s\n", + argv0, + filename != NULL ? filename : "<stdin>", + strerror(errno)); + return -1; + } + break; + } + chars++; + + if(c == '\n') lines++; + + if(iswspace(c)) + { + if(wordlen > 0) + { + words++; + wordlen = 0; + } + } + else + { + wordlen++; + } + } + + if(wordlen > 0) words++; + + if(FIELD_MATCH(wc_opts, WC_OPT_L)) printf("%ld ", lines); + if(FIELD_MATCH(wc_opts, WC_OPT_W)) printf("%ld ", words); + if(FIELD_MATCH(wc_opts, WC_OPT_C)) printf("%ld ", chars); + if(filename != NULL) printf("%s", filename); + printf("\n"); + + return 0; +} + +static void +usage() +{ + fprintf(stderr, "Usage: wc [-c|-m] [-lw] [file...]\n"); +} + +int +main(int argc, char *argv[]) +{ + errno = 0; + setlocale(LC_ALL, ""); + if(errno != 0) + { + fprintf(stderr, "%s: Warning: Failed to initialize locales: %s\n", argv0, strerror(errno)); + errno = 0; + } + int (*wc_file)(FILE *, char *) = &wc_file_bytes; + + int c = -1; + while((c = getopt(argc, argv, ":clmw")) != -1) + { + switch(c) + { + case 'c': + wc_opts |= WC_OPT_C; + wc_file = wc_file_bytes; + break; + case 'l': + wc_opts |= WC_OPT_L; + break; + case 'm': + wc_opts |= WC_OPT_C; + 
wc_file = wc_file_chars; + break; + case 'w': + wc_opts |= WC_OPT_W; + break; + case ':': + fprintf(stderr, "%s: Error: Missing operand for option: '-%c'\n", argv0, optopt); + usage(); + return 1; + case '?': + fprintf(stderr, "%s: Error: Unrecognised option: '-%c'\n", argv0, optopt); + usage(); + return 1; + default: + abort(); + } + } + if(wc_opts == 0) wc_opts = WC_OPT_ALL; + + assert(errno == 0); + + argc -= optind; + argv += optind; + + if(argc < 1) + { + if(wc_file(stdin, NULL) < 0) return 1; + } + + for(int i = 0; i < argc; i++) + { + char *path = argv[i]; + if(path[0] == '-' && path[1] == 0) + { + if(wc_file(stdin, NULL) < 0) return 1; + + continue; + } + + // https://www.austingroupbugs.net/view.php?id=251 + if(strchr(path, '\n') != NULL) + fprintf( + stderr, + "%s: Warning: Filename '%s' contains a newline while wc(1) uses newlines as separators\n", + argv0, + path); + + if(wc_opts == WC_OPT_C && wc_file == wc_file_bytes) + { + struct stat status; + if(stat(path, &status) < 0) + { + fprintf( + stderr, "%s: Failed getting status for file '%s': %s\n", argv0, path, strerror(errno)); + return 1; + } + + printf("%ld %s\n", status.st_size, path); + continue; + } + + FILE *file = fopen(path, "r"); + if(file == NULL) + { + fprintf(stderr, "%s: Failed opening file '%s': %s\n", argv0, path, strerror(errno)); + return 1; + } + + if(wc_file(file, path) < 0) return 1; + + if(fclose(file) < 0) + { + fprintf(stderr, "%s: Failed closing file '%s': %s\n", argv0, path, strerror(errno)); + return 1; + } + } + + if(argc > 1) + { + if(FIELD_MATCH(wc_opts, WC_OPT_L)) printf("%ld ", total_lines); + if(FIELD_MATCH(wc_opts, WC_OPT_W)) printf("%ld ", total_words); + if(FIELD_MATCH(wc_opts, WC_OPT_C)) printf("%ld ", total_bytes); + printf("total\n"); + } + + return 0; +} diff --git a/makeless.sh b/makeless.sh @@ -49,5 +49,6 @@ $CC -std=c99 $CFLAGS -o cmd/truncate cmd/truncate.c lib/truncation.c $LDFLAGS $L $CC -std=c99 $CFLAGS -o cmd/tty cmd/tty.c $LDFLAGS $LDSTATIC $CC -std=c99 
$CFLAGS -o cmd/uname cmd/uname.c $LDFLAGS $LDSTATIC $CC -std=c99 $CFLAGS -o cmd/unlink cmd/unlink.c $LDFLAGS $LDSTATIC +$CC -std=c99 $CFLAGS -o cmd/wc cmd/wc.c $LDFLAGS $LDSTATIC $M4 cmd/date.1.in > build/cmd/date.1 $M4 cmd/touch.1.in > build/cmd/touch.1 diff --git a/test-cmd/wc.t b/test-cmd/wc.t @@ -0,0 +1,42 @@ +#!/usr/bin/env cram +# SPDX-FileCopyrightText: 2017 Haelwenn (lanodan) Monnier <contact+utils@hacktivis.me> +# SPDX-License-Identifier: MPL-2.0 + + $ export PATH="$TESTDIR/../cmd:$PATH" + + $ test "$(command -v wc)" = "$TESTDIR/../cmd/wc" + + $ export LANG=C.UTF-8 LC_ALL=C.UTF-8 + + $ printf 'a b c' | wc + \s*0\s*3\s*5\s* (re) + + $ printf 'a\nb\nc\n' | wc + \s*3\s*3\s*6\s* (re) + $ printf 'a\nb\nc\n' | wc -clw + \s*3\s*3\s*6\s* (re) + + $ printf 'a\nbb\nccc\n' | wc + \s*3\s*3\s*9\s* (re) + $ printf 'a\nbb\nccc\n' | wc -clw + \s*3\s*3\s*9\s* (re) + + $ printf 'я нет 草' | wc -m + \s*7\s* (re) + $ printf 'я нет 草' | wc -mw + \s*3\s*7\s* (re) + $ printf 'я нет 草' | wc -mlw + \s*0\s*3\s*7\s* (re) + + $ wc <&- + wc: Failed reading from file '<stdin>': Bad file descriptor + [1] + $ wc -m <&- + wc: Failed reading from file '<stdin>': Bad file descriptor + [1] + + $ wc /dev/null + 0 0 0 /dev/null + $ wc /var/empty/e/no/ent + wc: Failed opening file '/var/empty/e/no/ent': No such file or directory + [1]