commit: 6640362d9df83900ffc23d7d966daaa611ad9fa5
parent 6d4cb0a82ed964934626d1364dc6445641fb4834
Author: Haelwenn (lanodan) Monnier <contact@hacktivis.me>
Date: Wed, 24 Apr 2024 13:15:41 +0200
cmd/wc: new
Diffstat:
A | cmd/wc.1 | 73 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
A | cmd/wc.c | 255 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
M | makeless.sh | 1 | + |
A | test-cmd/wc.t | 42 | ++++++++++++++++++++++++++++++++++++++++++ |
4 files changed, 371 insertions(+), 0 deletions(-)
diff --git a/cmd/wc.1 b/cmd/wc.1
@@ -0,0 +1,73 @@
+.\" utils-std: Collection of commonly available Unix tools
+.\" Copyright 2017 Haelwenn (lanodan) Monnier <contact+utils@hacktivis.me>
+.\" SPDX-License-Identifier: MPL-2.0
+.Dd 2024-04-24
+.Dt WC 1
+.Os
+.Sh NAME
+.Nm wc
+.Nd Count lines, words, bytes/characters
+.Sh SYNOPSIS
+.Nm
+.Op Fl c Ns | Ns Fl m
+.Op Fl lw
+.Op Ar file...
+.Sh DESCRIPTION
+.Nm
+reads each given
+.Ar file
+and by default reports the number of newlines, words and bytes in each of them.
+If no
+.Ar file
+is given, then
+.Nm
+reads from standard input.
+.Pp
+A word is defined as a non-empty string delimited by whitespace.
+Some other implementations additionally exclude
+non-printable characters.
+.Sh OPTIONS
+.Bl -tag -width Ds
+.It Fl c
+Explicitly use single-byte mode, and write the number of bytes in each
+.Ar file .
+.It Fl l
+Write the number of newlines in each
+.Ar file .
+.It Fl m
+Switch to multibyte-characters mode, and write the number of characters in each
+.Ar file .
+The particular encoding is dependent on your locale.
+.It Fl w
+Write the number of words in each
+.Ar file .
+.El
+.Pp
+If any option is specified,
+.Nm
+reports only the requested information.
+The order in which options are given does not change the order of the output fields.
+The default is equivalent to
+.Cm wc
+.Fl clw .
+.Sh ENVIRONMENT VARIABLES
+See
+.Xr locale 1 .
+.Sh STDOUT
+By default the standard output reports each file in the form:
+.Bd -literal
+"%d %d %d %s", <newlines>, <words>, <bytes>, <file>
+.Ed
+.Pp
+If more than one
+.Ar file
+is given, a final line is printed with "total" instead of a pathname.
+.Sh EXIT STATUS
+.Ex -std
+.Sh STANDARDS
+.Nm
+should be compliant with the
+.St -p1003.1-2008
+specification.
+.Sh AUTHORS
+.An Haelwenn (lanodan) Monnier Aq Mt contact+utils@hacktivis.me
diff --git a/cmd/wc.c b/cmd/wc.c
@@ -0,0 +1,255 @@
+// utils-std: Collection of commonly available Unix tools
+// SPDX-FileCopyrightText: 2017 Haelwenn (lanodan) Monnier <contact+utils@hacktivis.me>
+// SPDX-License-Identifier: MPL-2.0
+
+#define _POSIX_C_SOURCE 200809L
+#include "../lib/bitmasks.h"
+
+#include <assert.h>
+#include <ctype.h> // isspace
+#include <errno.h>
+#include <locale.h> // setlocale
+#include <stdbool.h>
+#include <stdint.h> // uint8_t
+#include <stdio.h> // fprintf, fopen
+#include <stdlib.h> // abort
+#include <string.h> // strchr, strerror
+#include <sys/stat.h>
+#include <unistd.h> // getopt
+#include <wchar.h>
+#include <wctype.h> // iswspace
+
+// Program name used as the prefix of every diagnostic written to stderr
+static const char *argv0 = "wc";
+
+// Bitmask of the counts requested on the command line
+static enum {
+	WC_OPT_C   = 1 << 0, // bytes (-c) or characters (-m)
+	WC_OPT_L   = 1 << 1, // newlines (-l)
+	WC_OPT_W   = 1 << 3, // words (-w)
+	WC_OPT_ALL = WC_OPT_C | WC_OPT_L | WC_OPT_W,
+} wc_opts = 0;
+
+// Sums accumulated across files, printed on the final "total" line
+static off_t total_bytes = 0, total_lines = 0, total_words = 0;
+
+// Count the newlines, words and bytes of `file` in single-byte mode.
+//
+// The counts are added to the running totals, then the fields enabled in
+// wc_opts (as tested by FIELD_MATCH) are printed, followed by `filename`
+// when it is not NULL.
+// Returns 0 on success; on a read error prints a diagnostic and returns -1.
+static int
+wc_file_bytes(FILE *file, char *filename)
+{
+	off_t bytes = 0, lines = 0, words = 0;
+	bool in_word = false;
+
+	while(true)
+	{
+		// errno is unspecified after successful library calls, so clear it
+		// here instead of asserting that earlier calls left it at zero
+		errno = 0;
+		int c = getc(file);
+		if(c == EOF)
+		{
+			if(errno != 0)
+			{
+				fprintf(stderr,
+				        "%s: Failed reading from file '%s': %s\n",
+				        argv0,
+				        filename != NULL ? filename : "<stdin>",
+				        strerror(errno));
+				return -1;
+			}
+			break;
+		}
+		bytes++;
+
+		if(c == '\n') lines++;
+
+		if(isspace(c))
+		{
+			// leaving a word: count it once
+			if(in_word) words++;
+			in_word = false;
+		}
+		else
+		{
+			in_word = true;
+		}
+	}
+
+	// input ended in the middle of a word
+	if(in_word) words++;
+
+	total_bytes += bytes;
+	total_lines += lines;
+	total_words += words;
+
+	// off_t has no printf conversion of its own; widen to intmax_t for %jd
+	if(FIELD_MATCH(wc_opts, WC_OPT_L)) printf("%jd ", (intmax_t)lines);
+	if(FIELD_MATCH(wc_opts, WC_OPT_W)) printf("%jd ", (intmax_t)words);
+	if(FIELD_MATCH(wc_opts, WC_OPT_C)) printf("%jd ", (intmax_t)bytes);
+	if(filename != NULL) printf("%s", filename);
+	printf("\n");
+
+	return 0;
+}
+
+// Count the newlines, words and characters of `file` in multibyte mode.
+//
+// Characters are read with getwc(), so decoding follows the current locale.
+// The counts are added to the running totals (the character count goes into
+// total_bytes, which is what the "total" line prints for WC_OPT_C), then the
+// fields enabled in wc_opts (as tested by FIELD_MATCH) are printed, followed
+// by `filename` when it is not NULL.
+// Returns 0 on success; on a read or decoding error prints a diagnostic and
+// returns -1.
+static int
+wc_file_chars(FILE *file, char *filename)
+{
+	off_t chars = 0, lines = 0, words = 0;
+	bool in_word = false;
+
+	while(true)
+	{
+		// errno is unspecified after successful library calls, so clear it
+		// here instead of asserting that earlier calls left it at zero
+		errno = 0;
+		wint_t c = getwc(file);
+		if(c == WEOF)
+		{
+			if(errno != 0)
+			{
+				fprintf(stderr,
+				        "%s: Failed reading from file '%s': %s\n",
+				        argv0,
+				        filename != NULL ? filename : "<stdin>",
+				        strerror(errno));
+				return -1;
+			}
+			break;
+		}
+		chars++;
+
+		if(c == '\n') lines++;
+
+		if(iswspace(c))
+		{
+			// leaving a word: count it once
+			if(in_word) words++;
+			in_word = false;
+		}
+		else
+		{
+			in_word = true;
+		}
+	}
+
+	// input ended in the middle of a word
+	if(in_word) words++;
+
+	// keep the "total" line correct in -m mode as well
+	total_bytes += chars;
+	total_lines += lines;
+	total_words += words;
+
+	// off_t has no printf conversion of its own; widen to intmax_t for %jd
+	if(FIELD_MATCH(wc_opts, WC_OPT_L)) printf("%jd ", (intmax_t)lines);
+	if(FIELD_MATCH(wc_opts, WC_OPT_W)) printf("%jd ", (intmax_t)words);
+	if(FIELD_MATCH(wc_opts, WC_OPT_C)) printf("%jd ", (intmax_t)chars);
+	if(filename != NULL) printf("%s", filename);
+	printf("\n");
+
+	return 0;
+}
+
+// Print the command synopsis to stderr.
+static void
+usage(void)
+{
+	fprintf(stderr, "Usage: wc [-c|-m] [-lw] [file...]\n");
+}
+
+// Entry point: parse the options, then count every operand in order, or
+// standard input when no operand is given. An operand of "-" also means
+// standard input.
+//
+// Returns 0 on success, 1 on a usage error or as soon as one file fails.
+int
+main(int argc, char *argv[])
+{
+	// setlocale() signals failure through its return value, not through errno
+	errno = 0;
+	if(setlocale(LC_ALL, "") == NULL)
+	{
+		fprintf(stderr, "%s: Warning: Failed to initialize locales: %s\n", argv0, strerror(errno));
+	}
+	errno = 0;
+
+	// Reader used for every input; -c and -m override each other, last one wins
+	int (*wc_file)(FILE *, char *) = &wc_file_bytes;
+
+	int c = -1;
+	while((c = getopt(argc, argv, ":clmw")) != -1)
+	{
+		switch(c)
+		{
+		case 'c':
+			wc_opts |= WC_OPT_C;
+			wc_file = wc_file_bytes;
+			break;
+		case 'l':
+			wc_opts |= WC_OPT_L;
+			break;
+		case 'm':
+			wc_opts |= WC_OPT_C;
+			wc_file = wc_file_chars;
+			break;
+		case 'w':
+			wc_opts |= WC_OPT_W;
+			break;
+		case ':':
+			fprintf(stderr, "%s: Error: Missing operand for option: '-%c'\n", argv0, optopt);
+			usage();
+			return 1;
+		case '?':
+			fprintf(stderr, "%s: Error: Unrecognised option: '-%c'\n", argv0, optopt);
+			usage();
+			return 1;
+		default:
+			abort();
+		}
+	}
+	if(wc_opts == 0) wc_opts = WC_OPT_ALL;
+
+	argc -= optind;
+	argv += optind;
+
+	if(argc < 1)
+	{
+		// successful calls above may have left errno set; start the reader clean
+		errno = 0;
+		if(wc_file(stdin, NULL) < 0) return 1;
+	}
+
+	for(int i = 0; i < argc; i++)
+	{
+		char *path = argv[i];
+		if(path[0] == '-' && path[1] == 0)
+		{
+			errno = 0;
+			if(wc_file(stdin, NULL) < 0) return 1;
+
+			continue;
+		}
+
+		// https://www.austingroupbugs.net/view.php?id=251
+		if(strchr(path, '\n') != NULL)
+			fprintf(
+			    stderr,
+			    "%s: Warning: Filename '%s' contains a newline while wc(1) uses newlines as separators\n",
+			    argv0,
+			    path);
+
+		// Only bytes requested: take the size from stat() instead of reading
+		if(wc_opts == WC_OPT_C && wc_file == wc_file_bytes)
+		{
+			struct stat status;
+			if(stat(path, &status) < 0)
+			{
+				fprintf(
+				    stderr, "%s: Failed getting status for file '%s': %s\n", argv0, path, strerror(errno));
+				return 1;
+			}
+
+			// st_size is only a byte count for regular files; anything else
+			// falls through and is counted by reading it
+			if(S_ISREG(status.st_mode))
+			{
+				total_bytes += status.st_size;
+				printf("%jd %s\n", (intmax_t)status.st_size, path);
+				continue;
+			}
+		}
+
+		FILE *file = fopen(path, "r");
+		if(file == NULL)
+		{
+			fprintf(stderr, "%s: Failed opening file '%s': %s\n", argv0, path, strerror(errno));
+			return 1;
+		}
+
+		errno = 0;
+		if(wc_file(file, path) < 0)
+		{
+			// the reader already printed a diagnostic; just release the stream
+			fclose(file);
+			return 1;
+		}
+
+		if(fclose(file) != 0)
+		{
+			fprintf(stderr, "%s: Failed closing file '%s': %s\n", argv0, path, strerror(errno));
+			return 1;
+		}
+	}
+
+	if(argc > 1)
+	{
+		// off_t has no printf conversion of its own; widen to intmax_t for %jd
+		if(FIELD_MATCH(wc_opts, WC_OPT_L)) printf("%jd ", (intmax_t)total_lines);
+		if(FIELD_MATCH(wc_opts, WC_OPT_W)) printf("%jd ", (intmax_t)total_words);
+		if(FIELD_MATCH(wc_opts, WC_OPT_C)) printf("%jd ", (intmax_t)total_bytes);
+		printf("total\n");
+	}
+
+	return 0;
+}
diff --git a/makeless.sh b/makeless.sh
@@ -49,5 +49,6 @@ $CC -std=c99 $CFLAGS -o cmd/truncate cmd/truncate.c lib/truncation.c $LDFLAGS $L
$CC -std=c99 $CFLAGS -o cmd/tty cmd/tty.c $LDFLAGS $LDSTATIC
$CC -std=c99 $CFLAGS -o cmd/uname cmd/uname.c $LDFLAGS $LDSTATIC
$CC -std=c99 $CFLAGS -o cmd/unlink cmd/unlink.c $LDFLAGS $LDSTATIC
+$CC -std=c99 $CFLAGS -o cmd/wc cmd/wc.c $LDFLAGS $LDSTATIC
$M4 cmd/date.1.in > build/cmd/date.1
$M4 cmd/touch.1.in > build/cmd/touch.1
diff --git a/test-cmd/wc.t b/test-cmd/wc.t
@@ -0,0 +1,42 @@
+#!/usr/bin/env cram
+# SPDX-FileCopyrightText: 2017 Haelwenn (lanodan) Monnier <contact+utils@hacktivis.me>
+# SPDX-License-Identifier: MPL-2.0
+
+# Put $TESTDIR/../cmd first in PATH and check that wc resolves there
+ $ export PATH="$TESTDIR/../cmd:$PATH"
+
+ $ test "$(command -v wc)" = "$TESTDIR/../cmd/wc"
+
+# The -m tests below feed UTF-8 input, so select a UTF-8 locale
+ $ export LANG=C.UTF-8 LC_ALL=C.UTF-8
+
+# Input without a trailing newline: no lines, but the last word is still counted
+ $ printf 'a b c' | wc
+ \s*0\s*3\s*5\s* (re)
+
+# No options and -clw are expected to give the same output
+ $ printf 'a\nb\nc\n' | wc
+ \s*3\s*3\s*6\s* (re)
+ $ printf 'a\nb\nc\n' | wc -clw
+ \s*3\s*3\s*6\s* (re)
+
+ $ printf 'a\nbb\nccc\n' | wc
+ \s*3\s*3\s*9\s* (re)
+ $ printf 'a\nbb\nccc\n' | wc -clw
+ \s*3\s*3\s*9\s* (re)
+
+# Multibyte mode: characters are counted, not bytes
+ $ printf 'я нет 草' | wc -m
+ \s*7\s* (re)
+ $ printf 'я нет 草' | wc -mw
+ \s*3\s*7\s* (re)
+ $ printf 'я нет 草' | wc -mlw
+ \s*0\s*3\s*7\s* (re)
+
+# Closed standard input: a read error is reported and the exit status is 1
+ $ wc <&-
+ wc: Failed reading from file '<stdin>': Bad file descriptor
+ [1]
+ $ wc -m <&-
+ wc: Failed reading from file '<stdin>': Bad file descriptor
+ [1]
+
+# File operands: an empty file, then a path that does not exist
+ $ wc /dev/null
+ 0 0 0 /dev/null
+ $ wc /var/empty/e/no/ent
+ wc: Failed opening file '/var/empty/e/no/ent': No such file or directory
+ [1]