commit: 41779a4a71e989cf806b1e39f4d35f27f0432be3
parent 8336707cacd804ab7c234f3b3e416af37aff5c78
Author: Haelwenn (lanodan) Monnier <contact@hacktivis.me>
Date: Thu, 28 Mar 2024 18:35:38 +0100
cmd/uniq: new
Diffstat:
5 files changed, 332 insertions(+), 0 deletions(-)
diff --git a/cmd/uniq.1 b/cmd/uniq.1
@@ -0,0 +1,54 @@
+.\" utils-std: Collection of commonly available Unix tools
+.\" Copyright 2017 Haelwenn (lanodan) Monnier <contact+utils@hacktivis.me>
+.\" SPDX-License-Identifier: MPL-2.0
+.Dd 2024-05-02
+.Dt UNIQ 1
+.Os
+.Sh NAME
+.Nm uniq
+.Nd report or filter out duplicated lines
+.Sh SYNOPSIS
+.Nm
+.Op Fl c Ns | Ns Fl d Ns | Ns Fl u
+.Op Fl f Ar field_shift
+.Op Fl s Ar byte_shift
+.Op Ar input_file Op Ar output_file
+.Sh DESCRIPTION
+The
+.Nm
+utility reads
+.Ar input_file
+line-by-line, writes the result to
+.Ar output_file
+and by default filters out adjacent repeated lines.
+.Sh OPTIONS
+The following options are supported:
+.Bl -tag -width Ds
+.It Fl c
+Prefix each written line with the number of times it occurred.
+.It Fl d
+Only write duplicated lines (count > 1).
+.It Fl f Ar field_shift
+Shift the compared line by
+.Ar field_shift
+amount of fields, where a field is
+.Aq blank
+separated.
+.It Fl s Ar byte_shift
+Shift the compared lines by
+.Ar byte_shift .
+If
+.Ar field_shift
+was also specified, it provides an additional shift.
+.It Fl u
+Only write unique lines (count == 1).
+.El
+.Sh EXIT STATUS
+.Ex -std
+.Sh STANDARDS
+.Nm
+should be compliant with the
+.St -p1003.1-2008
+specification.
+.Sh AUTHORS
+.An Haelwenn (lanodan) Monnier Aq Mt contact+utils@hacktivis.me
diff --git a/cmd/uniq.c b/cmd/uniq.c
@@ -0,0 +1,243 @@
+// utils-std: Collection of commonly available Unix tools
+// SPDX-FileCopyrightText: 2017 Haelwenn (lanodan) Monnier <contact+utils@hacktivis.me>
+// SPDX-License-Identifier: MPL-2.0
+
+#define _POSIX_C_SOURCE 200809L
+
+#include <assert.h>
+#include <ctype.h> // isblank
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h> // getline
+#include <stdlib.h> // strtoul, free
+#include <string.h> // strncmp
+#include <unistd.h> // getopt
+
+enum uniq_mode // which groups of adjacent equal lines get written, and how
+{
+ UNIQ, // default: write the first line of every group
+ COUNT, // -c: write every group, prefixed with its occurrence count
+ ONLY_REPEAT, // -d: write only groups that occurred more than once
+ NO_REPEAT, // -u: write only groups that occurred exactly once
+};
+
+// Parses the numeric operand of option -<opt> into *out (strtoul, base 0).
+// Prints a diagnostic and returns -1 on failure, 0 on success.
+static int
+parse_count(char opt, const char *arg, unsigned long *out)
+{
+	char *endptr = NULL;
+
+	errno = 0;
+	unsigned long val = strtoul(arg, &endptr, 0);
+	if(errno != 0)
+	{
+		fprintf(stderr, "uniq: Error: Failed parsing '-%c %s': %s\n", opt, arg, strerror(errno));
+		return -1;
+	}
+	if(endptr == arg || *endptr != '\0')
+	{
+		fprintf(
+		    stderr, "uniq: Error: Non-numeric characters passed to '-%c %s': %s\n", opt, arg, endptr);
+		return -1;
+	}
+
+	*out = val;
+	return 0;
+}
+
+// Returns the offset in line[0..len) at which comparison starts: past `field`
+// blank-separated fields, then `shift` more bytes, never beyond len.
+static size_t
+compare_offset(const char *line, size_t len, unsigned long field, unsigned long shift)
+{
+	size_t off = 0;
+
+	// Stop at end of line so a huge -f value does not spin uselessly.
+	for(unsigned long i = 0; i < field && off < len; i++)
+	{
+		// <ctype.h> functions are only defined for unsigned char values and EOF.
+		while(off < len && isblank((unsigned char)line[off]))
+			off++;
+
+		while(off < len && !isblank((unsigned char)line[off]))
+			off++;
+	}
+
+	size_t rest = len - off;
+	return off + (shift < rest ? shift : rest);
+}
+
+// Writes line[0..len) plus a newline if `mode` selects a run that occurred
+// `count` times. Prints a diagnostic and returns -1 on write error, else 0.
+static int
+write_group(FILE *output, enum uniq_mode mode, const char *line, size_t len, unsigned long count)
+{
+	bool ok = true;
+
+	switch(mode)
+	{
+	case UNIQ:
+		break;
+	case COUNT:
+		ok = fprintf(output, "%lu ", count) >= 0;
+		break;
+	case ONLY_REPEAT:
+		if(count < 2) return 0;
+		break;
+	case NO_REPEAT:
+		if(count != 1) return 0;
+		break;
+	}
+
+	// fwrite rather than "%s" so lines with embedded NUL bytes stay intact.
+	ok = ok && (len == 0 || fwrite(line, 1, len, output) == len);
+	ok = ok && fputc('\n', output) != EOF;
+	if(!ok)
+	{
+		fprintf(stderr, "uniq: Write error: %s\n", strerror(errno));
+		return -1;
+	}
+
+	return 0;
+}
+
+// uniq [-c|-d|-u] [-f field_shift] [-s byte_shift] [input_file [output_file]]
+// Collapses runs of adjacent lines that compare equal (after skipping the
+// requested fields/bytes) and writes the first line of each run, filtered or
+// prefixed with its count according to the selected mode.
+// Returns 0 on success, 1 on usage, read or write errors.
+int
+main(int argc, char *argv[])
+{
+	enum uniq_mode mode = UNIQ;
+	unsigned long field = 0, shift = 0;
+
+	int c = -1;
+	while((c = getopt(argc, argv, ":cdf:s:u")) != -1)
+	{
+		switch(c)
+		{
+		case 'c':
+		case 'd':
+		case 'u':
+			// -c, -d and -u are mutually exclusive.
+			if(mode != UNIQ)
+			{
+				fprintf(stderr, "uniq: Error: can only pass one of [-c|-d|-u]\n");
+				return 1;
+			}
+			mode = (c == 'c') ? COUNT : (c == 'd') ? ONLY_REPEAT : NO_REPEAT;
+			break;
+		case 'f':
+			if(parse_count('f', optarg, &field) != 0) return 1;
+			break;
+		case 's':
+			if(parse_count('s', optarg, &shift) != 0) return 1;
+			break;
+		case ':': // optstring starts with ':', so this is a missing operand
+			fprintf(stderr, "uniq: Error: Missing operand for option: '-%c'\n", optopt);
+			return 1;
+		default: // '?': option not listed in optstring
+			fprintf(stderr, "uniq: Error: Unrecognised option: '-%c'\n", optopt);
+			return 1;
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if(argc > 2)
+	{
+		fprintf(stderr, "uniq: Invalid number of arguments (%d), expected [0..2]\n", argc);
+		return 1;
+	}
+
+	FILE *input = stdin;
+	FILE *output = stdout;
+
+	// POSIX: an input_file of "-" means standard input.
+	if(argc >= 1 && strcmp(argv[0], "-") != 0)
+	{
+		input = fopen(argv[0], "r");
+		if(input == NULL)
+		{
+			fprintf(stderr, "uniq: Failed opening input file '%s': %s\n", argv[0], strerror(errno));
+			return 1;
+		}
+	}
+
+	if(argc == 2)
+	{
+		output = fopen(argv[1], "w");
+		if(output == NULL)
+		{
+			fprintf(stderr, "uniq: Failed opening output file '%s': %s\n", argv[1], strerror(errno));
+			return 1;
+		}
+	}
+
+	// `first` owns the first line of the current run; `cur` is getline's buffer.
+	char *first = NULL, *cur = NULL;
+	size_t first_size = 0, cur_size = 0;
+	size_t first_len = 0, first_shift = 0;
+	unsigned long counter = 0;
+	ssize_t nread = 0;
+	int ret = 0;
+
+	while(ret == 0 && (nread = getline(&cur, &cur_size, input)) != -1)
+	{
+		size_t cur_len = (size_t)nread;
+		if(cur_len > 0 && cur[cur_len - 1] == '\n') cur[--cur_len] = '\0';
+
+		size_t cur_shift = compare_offset(cur, cur_len, field, shift);
+
+		// memcmp on explicit lengths: getline() lines may contain NUL bytes.
+		if(first != NULL && cur_len - cur_shift == first_len - first_shift &&
+		   memcmp(cur + cur_shift, first + first_shift, cur_len - cur_shift) == 0)
+		{
+			counter++;
+			continue;
+		}
+
+		if(first != NULL && write_group(output, mode, first, first_len, counter) != 0) ret = 1;
+
+		// The current line starts a new run: swap buffers so `first` keeps it
+		// and getline() recycles the previous allocation.
+		char *tmp = first;
+		size_t tmp_size = first_size;
+		first = cur;
+		first_size = cur_size;
+		cur = tmp;
+		cur_size = tmp_size;
+		first_len = cur_len;
+		first_shift = cur_shift;
+		counter = 1;
+	}
+
+	// getline() returns -1 for both EOF and failure; the stream tells which.
+	// Grab errno now, before writing the last run can overwrite it.
+	bool read_failed = ferror(input) != 0;
+	int read_errno = errno;
+
+	if(ret == 0 && first != NULL && write_group(output, mode, first, first_len, counter) != 0) ret = 1;
+
+	if(read_failed)
+	{
+		fprintf(stderr, "uniq: Read error: %s\n", strerror(read_errno));
+		ret = 1;
+	}
+
+	free(cur);
+	free(first);
+	if(input != stdin) fclose(input);
+
+	// Buffered output can fail as late as the final flush, so check it too.
+	if(fclose(output) != 0 && ret == 0)
+	{
+		fprintf(stderr, "uniq: Write error: %s\n", strerror(errno));
+		ret = 1;
+	}
+
+	return ret;
+}
diff --git a/makeless.sh b/makeless.sh
@@ -54,6 +54,7 @@ $CC -std=c99 $CFLAGS -o cmd/true cmd/true.c $LDFLAGS $LDSTATIC
$CC -std=c99 $CFLAGS -o cmd/truncate cmd/truncate.c lib/truncation.c $LDFLAGS $LDSTATIC
$CC -std=c99 $CFLAGS -o cmd/tty cmd/tty.c $LDFLAGS $LDSTATIC
$CC -std=c99 $CFLAGS -o cmd/uname cmd/uname.c $LDFLAGS $LDSTATIC
+$CC -std=c99 $CFLAGS -o cmd/uniq cmd/uniq.c $LDFLAGS $LDSTATIC
$CC -std=c99 $CFLAGS -o cmd/unlink cmd/unlink.c $LDFLAGS $LDSTATIC
$CC -std=c99 $CFLAGS -o cmd/wc cmd/wc.c $LDFLAGS $LDSTATIC
$YACC -b cmd/expr cmd/expr.y
diff --git a/test-cmd/uniq_0I.txt b/test-cmd/uniq_0I.txt
@@ -0,0 +1,7 @@
+#01 foo0 bar0 foo1 bar1
+#02 bar0 foo1 bar1 foo1
+#03 foo0 bar0 foo1 bar1
+#04
+#05 foo0 bar0 foo1 bar1
+#06 foo0 bar0 foo1 bar1
+#07 bar0 foo1 bar1 foo0
diff --git a/test-cmd/uniq_posix.t b/test-cmd/uniq_posix.t
@@ -0,0 +1,27 @@
+#!/usr/bin/env cram
+# SPDX-FileCopyrightText: 2017 Haelwenn (lanodan) Monnier <contact+utils@hacktivis.me>
+# SPDX-License-Identifier: MPL-2.0
+
+ $ export PATH="$TESTDIR/../cmd:$PATH"
+
+ $ test "$(command -v uniq)" = "$TESTDIR/../cmd/uniq"
+
+ $ uniq -c -f 1 "${TESTDIR}/uniq_0I.txt"
+ \s*1 #01 foo0 bar0 foo1 bar1 (re)
+ \s*1 #02 bar0 foo1 bar1 foo1 (re)
+ \s*1 #03 foo0 bar0 foo1 bar1 (re)
+ \s*1 #04 (re)
+ \s*2 #05 foo0 bar0 foo1 bar1 (re)
+ \s*1 #07 bar0 foo1 bar1 foo0 (re)
+
+ $ uniq -d -f 1 "${TESTDIR}/uniq_0I.txt"
+ #05 foo0 bar0 foo1 bar1
+
+ $ uniq -u -f 1 "${TESTDIR}/uniq_0I.txt"
+ #01 foo0 bar0 foo1 bar1
+ #02 bar0 foo1 bar1 foo1
+ #03 foo0 bar0 foo1 bar1
+ #04
+ #07 bar0 foo1 bar1 foo0
+
+ $ uniq -d -s 2 "${TESTDIR}/uniq_0I.txt"