logo

utils-std

Collection of commonly available Unix tools
git clone https://anongit.hacktivis.me/git/utils-std.git
commit: 4c21c817b83edddf81a3b34f42bba156062de792
parent 7b5f49a107df7eb8b0c8550de63da2fe993608c1
Author: Haelwenn (lanodan) Monnier <contact@hacktivis.me>
Date:   Tue,  5 Nov 2024 20:37:15 +0100

cmd/wc: use fd for higher processing speed in bytes mode

As the benchmarks below show, there is still room for improvement, but
roughly: '-c' got a 3× speedup and '-l' got a 2× speedup.

  $ uname -a
  Linux cloudchaser 6.6.38-gentoo #2 SMP PREEMPT_DYNAMIC Thu Aug 29 05:05:27 2024 x86_64
  $ grep -e 'model name' /proc/cpuinfo | head -n 1
  model name      : AMD Ryzen 5 PRO 3500U w/ Radeon Vega Mobile Gfx
  $ qfile -v /usr/bin/wc /opt/lanodan/bin/wc
  sys-apps/coreutils-9.5: /usr/bin/wc
  sys-apps/utils-std-9999: /opt/lanodan/bin/wc
  $ timeout 10 /usr/bin/yes | pv -trb > /dev/null
  12.2GiB 0:00:10 [1.22GiB/s]
  $ timeout 10 /usr/bin/yes | pv -trb | /opt/lanodan/bin/cat > /dev/null
  6.16GiB 0:00:09 [ 630MiB/s]
  $ timeout 10 /usr/bin/yes | pv -trb | /usr/bin/wc -c > /dev/null
  7.71GiB 0:00:09 [ 789MiB/s]
  $ timeout 10 /usr/bin/yes | pv -trb | /usr/bin/wc -l > /dev/null
  7.53GiB 0:00:09 [ 770MiB/s]
  $ timeout 10 /usr/bin/yes | pv -trb | /opt/lanodan/bin/wc -c > /dev/null
  2.40GiB 0:00:10 [ 245MiB/s]
  $ timeout 10 /usr/bin/yes | pv -trb | /opt/lanodan/bin/wc -l > /dev/null
  2.38GiB 0:00:09 [ 243MiB/s]
  $ timeout 10 /usr/bin/yes | pv -trb | ./cmd/wc -c > /dev/null
  7.44GiB 0:00:09 [ 761MiB/s]
  $ timeout 10 /usr/bin/yes | pv -trb | ./cmd/wc -l > /dev/null
  5.48GiB 0:00:09 [ 560MiB/s]

Diffstat:

M cmd/wc.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------
1 file changed, 88 insertions(+), 36 deletions(-)

diff --git a/cmd/wc.c b/cmd/wc.c @@ -8,6 +8,7 @@ #include <assert.h> #include <ctype.h> // isspace #include <errno.h> +#include <fcntl.h> // posix_fadvise #include <locale.h> // setlocale #include <stdbool.h> #include <stdint.h> // uint8_t @@ -22,6 +23,9 @@ #include <getopt.h> #endif +#define WC_BUFSIZ 16320 +static char buf[WC_BUFSIZ] = ""; + static const char *argv0 = "wc"; static enum { @@ -57,43 +61,49 @@ print_counts(off_t lines, off_t words, off_t bytes, const char *filename) } static int -wc_file_bytes(FILE *file, char *filename) +wc_file_bytes(int fd, char *filename) { off_t bytes = 0, lines = 0, words = 0, wordlen = 0; - while(true) - { - assert(errno == 0); - int c = getc(file); - if(c == EOF) - { - if(errno != 0) - { - fprintf(stderr, - "%s: error: Failed reading from file '%s': %s\n", - argv0, - filename != NULL ? filename : "<stdin>", - strerror(errno)); - return -1; - } - break; - } - bytes++; + int lw = FIELD_MATCH(wc_opts, WC_OPT_L) || FIELD_MATCH(wc_opts, WC_OPT_W); - if(c == '\n') lines++; + ssize_t nread = -1; + while((nread = read(fd, buf, WC_BUFSIZ)) > 0) + { + bytes += nread; - if(isspace(c)) + if(lw) { - if(wordlen > 0) + for(ssize_t i = 0; i < nread; i++) { - words++; - wordlen = 0; + int c = buf[i]; + + if(c == '\n') lines++; + + if(isspace(c)) + { + if(wordlen > 0) + { + words++; + wordlen = 0; + } + } + else + { + wordlen++; + } } } - else - { - wordlen++; - } + } + + if(nread < 0 && errno != 0) + { + fprintf(stderr, + "%s: error: Failed reading from file '%s': %s\n", + argv0, + filename != NULL ? 
filename : "<stdin>", + strerror(errno)); + return -1; } if(wordlen > 0) words++; @@ -106,10 +116,30 @@ wc_file_bytes(FILE *file, char *filename) } static int -wc_file_chars(FILE *file, char *filename) +wc_file_chars(int fd, char *filename) { off_t chars = 0, lines = 0, words = 0, wordlen = 0; + FILE *file = fdopen(fd, "r"); + if(file == NULL) + { + fprintf(stderr, + "%s: error: Failed getting file stream for file '%s': %s\n", + argv0, + filename, + strerror(errno)); + return -1; + } + + if(setvbuf(file, buf, _IOFBF, WC_BUFSIZ) != 0) + { + fprintf(stderr, + "%s: warning: Failed setting a new buffer for <stdin>: %s\n", + argv0, + strerror(errno)); + errno = 0; + } + while(true) { assert(errno == 0); @@ -168,7 +198,7 @@ main(int argc, char *argv[]) fprintf(stderr, "%s: warning: Failed to initialize locales: %s\n", argv0, strerror(errno)); errno = 0; } - int (*wc_file)(FILE *, char *) = &wc_file_bytes; + int (*wc_file)(int, char *) = &wc_file_bytes; int c = -1; #ifdef HAS_GETOPT_LONG @@ -224,9 +254,21 @@ main(int argc, char *argv[]) argc -= optind; argv += optind; + if((errno = posix_fadvise(STDIN_FILENO, 0, 0, POSIX_FADV_SEQUENTIAL)) != 0) + { + if(errno != ESPIPE) + { + fprintf(stderr, + "%s: warning: Failure from posix_fadvise sequential for <stdin>: %s\n", + argv0, + strerror(errno)); + } + errno = 0; + } + if(argc < 1) { - if(wc_file(stdin, NULL) < 0) return 1; + if(wc_file(STDIN_FILENO, NULL) < 0) return 1; } for(int i = 0; i < argc; i++) @@ -234,7 +276,7 @@ main(int argc, char *argv[]) char *path = argv[i]; if(path[0] == '-' && path[1] == 0) { - if(wc_file(stdin, NULL) < 0) return 1; + if(wc_file(STDIN_FILENO, NULL) < 0) return 1; continue; } @@ -265,16 +307,26 @@ main(int argc, char *argv[]) continue; } - FILE *file = fopen(path, "r"); - if(file == NULL) + int arg_fd = open(path, O_RDONLY | O_NOCTTY); + if(arg_fd < 0) { fprintf(stderr, "%s: error: Failed opening file '%s': %s\n", argv0, path, strerror(errno)); return 1; } - if(wc_file(file, path) < 0) return 
1; + if((errno = posix_fadvise(arg_fd, 0, 0, POSIX_FADV_SEQUENTIAL)) != 0) + { + fprintf(stderr, + "%s: warning: Failure from posix_fadvise sequential for file '%s': %s\n", + argv0, + path, + strerror(errno)); + errno = 0; + } + + if(wc_file(arg_fd, path) < 0) return 1; - if(fclose(file) < 0) + if(close(arg_fd) < 0) { fprintf(stderr, "%s: error: Failed closing file '%s': %s\n", argv0, path, strerror(errno)); return 1;