logo

utils-std

Collection of commonly available Unix tools
commit: 41779a4a71e989cf806b1e39f4d35f27f0432be3
parent 8336707cacd804ab7c234f3b3e416af37aff5c78
Author: Haelwenn (lanodan) Monnier <contact@hacktivis.me>
Date:   Thu, 28 Mar 2024 18:35:38 +0100

cmd/uniq: new

Diffstat:

A cmd/uniq.1 | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
A cmd/uniq.c | 243 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M makeless.sh | 1 +
A test-cmd/uniq_0I.txt | 7 +++++++
A test-cmd/uniq_posix.t | 27 +++++++++++++++++++++++++++
5 files changed, 332 insertions(+), 0 deletions(-)

diff --git a/cmd/uniq.1 b/cmd/uniq.1 @@ -0,0 +1,54 @@ +.\" utils-std: Collection of commonly available Unix tools +.\" Copyright 2017 Haelwenn (lanodan) Monnier <contact+utils@hacktivis.me> +.\" SPDX-License-Identifier: MPL-2.0 +.Dd 2024-05-02 +.Dt UNIQ 1 +.Os +.Sh NAME +.Nm uniq +.Nd report or filter out duplicated lines +.Sh SYNOPSIS +.Nm +.Op Fl c Ns | Ns Fl d Ns | Ns Fl u +.Op Fl f Ar field_shift +.Op Fl s Ar byte_shift +.Op Ar input_file Op Ar output_file +.Sh DESCRIPTION +The +.Nm +utility reads +.Ar input_file +and writes to +.Ar output_file +line-by-line, by default filtering out adjacent repeated lines. +.Sh OPTIONS +The following options are supported: +.Bl -tag -width Ds +.It Fl c +Prefix each line with the number of times it occurred. +.It Fl d +Only write duplicated lines (count > 1). +.It Fl f Ar field_shift +Shift the compared line by +.Ar field_shift +fields, where fields are +.Aq blank +separated. +.It Fl s Ar byte_shift +Shift the compared lines by +.Ar byte_shift . +If +.Ar field_shift +was also specified, it provides an additional shift. +.It Fl u +Only write unique lines (count == 1). +.El +.Sh EXIT STATUS +.Ex -std +.Sh STANDARDS +.Nm +should be compliant with the +.St -p1003.1-2008 +specification. 
/*
 * Tail of the preceding cmd/uniq.1 hunk and the header of the cmd/uniq.c
 * hunk, kept verbatim from the commit view:
 *
 * +.Sh AUTHORS
 * +.An Haelwenn (lanodan) Monnier Aq Mt contact+utils@hacktivis.me
 * diff --git a/cmd/uniq.c b/cmd/uniq.c
 * @@ -0,0 +1,243 @@
 */
// utils-std: Collection of commonly available Unix tools
// SPDX-FileCopyrightText: 2017 Haelwenn (lanodan) Monnier <contact+utils@hacktivis.me>
// SPDX-License-Identifier: MPL-2.0

#define _POSIX_C_SOURCE 200809L

#include <assert.h>
#include <ctype.h> // isblank
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>  // getline
#include <stdlib.h> // abort, free, strtoul
#include <string.h> // memcmp, strchr, strcmp, strerror
#include <unistd.h> // getopt

// Which groups of adjacent equal lines get written, and how.
enum uniq_mode
{
	UNIQ, // default: one line per group
	COUNT, // -c: one line per group, prefixed with the group size
	ONLY_REPEAT, // -d: only groups of more than one line
	NO_REPEAT, // -u: only groups of exactly one line
};

// Parses the operand of -f / -s into *out.
// opt is the option letter, used only for diagnostics.
// Returns 0 on success, -1 after printing a diagnostic on stderr.
static int
parse_count(char opt, const char *arg, unsigned long *out)
{
	char *endptr = NULL;

	errno = 0;
	// Base 0 is kept from the initial implementation (accepts 0x.. and 0.. prefixes).
	unsigned long value = strtoul(arg, &endptr, 0);
	if(errno != 0)
	{
		fprintf(stderr, "uniq: Error: Failed parsing '-%c %s': %s\n", opt, arg, strerror(errno));
		return -1;
	}

	// strtoul() silently accepts an empty string (yielding 0) and a leading
	// minus sign (yielding a wrapped-around value); neither is a valid count.
	if(endptr == arg || strchr(arg, '-') != NULL)
	{
		fprintf(stderr, "uniq: Error: Expected a non-negative number for '-%c %s'\n", opt, arg);
		return -1;
	}

	if(endptr[0] != '\0')
	{
		fprintf(
		    stderr, "uniq: Error: Non-numeric characters passed to '-%c %s': %s\n", opt, arg, endptr);
		return -1;
	}

	*out = value;
	return 0;
}

// Returns the offset in line (of length len) at which comparison starts:
// first skip `fields` <blank>-separated fields, then `chars` more bytes.
// The result is clamped to len, so a line shorter than the requested shift
// compares as an empty string instead of being read out of bounds.
static size_t
compare_offset(const char *line, size_t len, unsigned long fields, unsigned long chars)
{
	size_t off = 0;

	// Stop as soon as the end of line is reached: further fields are empty,
	// and `fields` can be as large as ULONG_MAX.
	for(unsigned long i = 0; i < fields && off < len; i++)
	{
		// <ctype.h> functions require a value representable as unsigned char
		while(off < len && isblank((unsigned char)line[off]))
			off++;

		while(off < len && !isblank((unsigned char)line[off]))
			off++;
	}

	if(chars > len - off) return len;

	return off + chars;
}

// Writes one group, represented by its first line, according to mode.
// count is the number of adjacent lines which compared equal.
// Write failures are left in the stream's error indicator for the caller.
static void
print_group(FILE *out, enum uniq_mode mode, const char *line, size_t len, unsigned long count)
{
	switch(mode)
	{
	case UNIQ:
		break;
	case COUNT:
		fprintf(out, "%lu ", count);
		break;
	case ONLY_REPEAT:
		if(count <= 1) return;
		break;
	case NO_REPEAT:
		if(count != 1) return;
		break;
	}

	fwrite(line, 1, len, out);
	fputc('\n', out);
}

// uniq [-c|-d|-u] [-f field_shift] [-s byte_shift] [input_file [output_file]]
//
// Reads lines from input_file (standard input when absent or "-") and writes
// one line per group of adjacent equal lines to output_file (standard output
// when absent). Returns 0 on success and 1 on usage, read or write errors.
int
main(int argc, char *argv[])
{
	enum uniq_mode mode = UNIQ;
	unsigned long field = 0, shift = 0;

	int c = -1;
	while((c = getopt(argc, argv, ":cdf:s:u")) != -1)
	{
		switch(c)
		{
		case 'c':
		case 'd':
		case 'u':
			if(mode != UNIQ)
			{
				fprintf(stderr, "uniq: Error: can only pass one of [-c|-d|-u]\n");
				return 1;
			}
			mode = (c == 'c') ? COUNT : (c == 'd') ? ONLY_REPEAT : NO_REPEAT;
			break;
		case 'f':
			if(parse_count('f', optarg, &field) != 0) return 1;
			break;
		case 's':
			if(parse_count('s', optarg, &shift) != 0) return 1;
			break;
		case ':':
			// The leading ':' in the optstring makes getopt() report a missing operand this way
			fprintf(stderr, "uniq: Error: Missing operand for option: '-%c'\n", optopt);
			return 1;
		case '?':
			fprintf(stderr, "uniq: Error: Unrecognised option: '-%c'\n", optopt);
			return 1;
		default:
			abort();
		}
	}

	argc -= optind;
	argv += optind;

	if(argc > 2)
	{
		fprintf(stderr, "uniq: Invalid number of arguments (%d), expected [0..2]\n", argc);
		return 1;
	}

	FILE *input = stdin;
	FILE *output = stdout;

	// POSIX: an input_file operand of "-" means standard input
	if(argc >= 1 && strcmp(argv[0], "-") != 0)
	{
		input = fopen(argv[0], "r");
		if(input == NULL)
		{
			fprintf(stderr, "uniq: Failed opening input file '%s': %s\n", argv[0], strerror(errno));
			return 1;
		}
	}

	if(argc == 2)
	{
		output = fopen(argv[1], "w");
		if(output == NULL)
		{
			fprintf(stderr, "uniq: Failed opening output file '%s': %s\n", argv[1], strerror(errno));
			if(input != stdin) fclose(input);
			return 1;
		}
	}

	// First line of the group currently being accumulated
	char *first = NULL;
	size_t first_cap = 0, first_len = 0, first_shift = 0;
	unsigned long counter = 0;

	// Line being read; its buffer is reused by getline() between iterations
	char *cur = NULL;
	size_t cur_cap = 0;
	ssize_t nread = -1;

	while((nread = getline(&cur, &cur_cap, input)) != -1)
	{
		size_t cur_len = (size_t)nread;

		if(cur_len > 0 && cur[cur_len - 1] == '\n') cur[--cur_len] = '\0';

		size_t cur_shift = compare_offset(cur, cur_len, field, shift);

		bool same = first != NULL && (cur_len - cur_shift == first_len - first_shift) &&
		            memcmp(cur + cur_shift, first + first_shift, cur_len - cur_shift) == 0;

		if(same)
		{
			counter++;
			continue;
		}

		if(first != NULL) print_group(output, mode, first, first_len, counter);

		// The current line starts a new group: swap the two buffers so the old
		// group's allocation is recycled by the next getline() call.
		char *tmp = first;
		size_t tmp_cap = first_cap;

		first = cur;
		first_cap = cur_cap;
		first_len = cur_len;
		first_shift = cur_shift;
		counter = 1;

		cur = tmp;
		cur_cap = tmp_cap;
	}

	// getline() returns -1 both at end-of-file and on error; save errno before
	// any other call can overwrite it.
	int read_errno = errno;
	int ret = 0;

	if(first != NULL) print_group(output, mode, first, first_len, counter);

	free(first);
	free(cur);

	if(ferror(input) != 0)
	{
		fprintf(stderr, "uniq: Read error: %s\n", strerror(read_errno));
		ret = 1;
	}

	if(input != stdin) fclose(input);

	if(ferror(output) != 0 || fflush(output) != 0)
	{
		fprintf(stderr, "uniq: Write error: %s\n", strerror(errno));
		ret = 1;
	}

	if(output != stdout && fclose(output) != 0)
	{
		fprintf(stderr, "uniq: Failed closing output file '%s': %s\n", argv[1], strerror(errno));
		ret = 1;
	}

	return ret;
}

/*
 * Remaining hunks of the commit, kept verbatim from the commit view
 * (the last one continues on the following line of this page):
 *
 * diff --git a/makeless.sh b/makeless.sh
 * @@ -54,6 +54,7 @@
 *  $CC -std=c99 $CFLAGS -o cmd/true cmd/true.c $LDFLAGS $LDSTATIC
 *  $CC -std=c99 $CFLAGS -o cmd/truncate cmd/truncate.c lib/truncation.c $LDFLAGS $LDSTATIC
 *  $CC -std=c99 $CFLAGS -o cmd/tty cmd/tty.c $LDFLAGS $LDSTATIC
 *  $CC -std=c99 $CFLAGS -o cmd/uname cmd/uname.c $LDFLAGS $LDSTATIC
 * +$CC -std=c99 $CFLAGS -o cmd/uniq cmd/uniq.c $LDFLAGS $LDSTATIC
 *  $CC -std=c99 $CFLAGS -o cmd/unlink cmd/unlink.c $LDFLAGS $LDSTATIC
 *  $CC -std=c99 $CFLAGS -o cmd/wc cmd/wc.c $LDFLAGS $LDSTATIC
 *  $YACC -b cmd/expr cmd/expr.y
 *
 * diff --git a/test-cmd/uniq_0I.txt b/test-cmd/uniq_0I.txt
 * @@ -0,0 +1,7 @@
 * +#01 foo0 bar0 foo1 bar1
 * +#02 bar0 foo1 bar1 foo1
 * +#03 foo0 bar0 foo1 bar1
 * +#04
 * +#05 foo0 bar0 foo1 bar1
 * +#06 foo0 bar0 foo1 bar1
 * +#07 bar0 foo1 bar1 foo0
 *
 * diff --git a/test-cmd/uniq_posix.t b/test-cmd/uniq_posix.t
 * @@ -0,0 +1,27 @@
 * +#!/usr/bin/env cram
 * +# SPDX-FileCopyrightText: 2017 Haelwenn (lanodan) Monnier <contact+utils@hacktivis.me>
 * +# SPDX-License-Identifier: MPL-2.0
 * +
 * +  $ export PATH="$TESTDIR/../cmd:$PATH"
 * +
 * +  $ test "$(command -v uniq)" = "$TESTDIR/../cmd/uniq"
 * +
 * +  $ uniq -c -f 1 "${TESTDIR}/uniq_0I.txt"
 * +  \s*1 #01 foo0 bar0 foo1 bar1 (re)
 * +  \s*1 #02 bar0 foo1 bar1 foo1 (re)
 * +  \s*1 #03 foo0 bar0 foo1 bar1 (re)
 * +  \s*1 #04 (re)
 * +  \s*2 #05 foo0 bar0 foo1 bar1 (re)
 * +  \s*1 #07 bar0 foo1 bar1 foo0 (re)
 * +
 * +  $ uniq
 */
-d -f 1 "${TESTDIR}/uniq_0I.txt" + #05 foo0 bar0 foo1 bar1 + + $ uniq -u -f 1 "${TESTDIR}/uniq_0I.txt" + #01 foo0 bar0 foo1 bar1 + #02 bar0 foo1 bar1 foo1 + #03 foo0 bar0 foo1 bar1 + #04 + #07 bar0 foo1 bar1 foo0 + + $ uniq -d -s 2 "${TESTDIR}/uniq_0I.txt"