logo

utils-std

Collection of commonly available Unix tools git clone https://anongit.hacktivis.me/git/utils-std.git/

join.c (16033B)


  1. /*-
  2. * SPDX-License-Identifier: BSD-3-Clause
  3. *
  4. * Copyright (c) 1991, 1993, 1994
  5. * The Regents of the University of California. All rights reserved.
  6. *
  7. * This code is derived from software contributed to Berkeley by
  8. * Steve Hayman of the Computer Science Department, Indiana University,
  9. * Michiro Hikida and David Goodenough.
  10. *
  11. * Redistribution and use in source and binary forms, with or without
  12. * modification, are permitted provided that the following conditions
  13. * are met:
  14. * 1. Redistributions of source code must retain the above copyright
  15. * notice, this list of conditions and the following disclaimer.
  16. * 2. Redistributions in binary form must reproduce the above copyright
  17. * notice, this list of conditions and the following disclaimer in the
  18. * documentation and/or other materials provided with the distribution.
  19. * 3. Neither the name of the University nor the names of its contributors
  20. * may be used to endorse or promote products derived from this software
  21. * without specific prior written permission.
  22. *
  23. * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  24. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  25. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  26. * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  27. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  28. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  29. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  30. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  32. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  33. * SUCH DAMAGE.
  34. */
  35. #define _POSIX_C_SOURCE 202405L
  36. #define _BSD_SOURCE // fgetln, strsep
  37. #include "../lib/reallocarray.h"
  38. #include "../libutils/err.h"
  39. #include "../libutils/getopt_nolong.h"
  40. #include <assert.h>
  41. #include <errno.h>
  42. #include <limits.h>
  43. #include <locale.h>
  44. #include <stdio.h> // fgetln
  45. #include <stdlib.h>
  46. #include <string.h> // strsep, strerror
  47. #include <sys/param.h>
  48. #include <unistd.h>
  49. #include <wchar.h>
  50. /*
  51. * There's a structure per input file which encapsulates the state of the
  52. * file. We repeatedly read lines from each file until we've read in all
  53. * the consecutive lines from the file with a common join field. Then we
  54. * compare the set of lines with an equivalent set from the other file.
  55. */
  56. typedef struct
  57. {
  58. char *line; /* line */
  59. unsigned long linealloc; /* line allocated count */
  60. char **fields; /* line field(s) */
  61. unsigned long fieldcnt; /* line field(s) count */
  62. unsigned long fieldalloc; /* line field(s) allocated count */
  63. } LINE;
  64. typedef struct
  65. {
  66. FILE *fp; /* file descriptor */
  67. unsigned long joinf; /* join field (-1, -2, -j) */
  68. int unpair; /* output unpairable lines (-a) */
  69. unsigned long number; /* 1 for file 1, 2 for file 2 */
  70. LINE *set; /* set of lines with same field */
  71. int pushbool; /* if pushback is set */
  72. unsigned long pushback; /* line on the stack */
  73. unsigned long setcnt; /* set count */
  74. unsigned long setalloc; /* set allocated count */
  75. } INPUT;
  76. static INPUT input1 = {NULL, 0, 0, 1, NULL, 0, 0, 0, 0}, input2 = {NULL, 0, 0, 2, NULL, 0, 0, 0, 0};
  77. typedef struct
  78. {
  79. unsigned long filenum; /* file number */
  80. unsigned long fieldno; /* field number */
  81. } OLIST;
  82. static OLIST *olist; /* output field list */
  83. static unsigned long olistcnt; /* output field list count */
  84. static unsigned long olistalloc; /* output field allocated count */
  85. static int joinout = 1; /* show lines with matched join fields (-v) */
  86. static int needsep; /* need separator character */
  87. static int spans = 1; /* span multiple delimiters (-t) */
  88. static char *empty; /* empty field replacement string (-e) */
  89. static wchar_t default_tabchar[] = L" \t";
  90. static wchar_t *tabchar = default_tabchar; /* delimiter characters (-t) */
  91. static int cmp(LINE *, unsigned long, LINE *, unsigned long);
  92. static void fieldarg(char *);
  93. static void joinlines(INPUT *, INPUT *);
  94. static int mbscoll(const char *, const char *);
  95. static char *mbssep(char **, const wchar_t *);
  96. static void outfield(LINE *, unsigned long, int);
  97. static void outoneline(INPUT *, LINE *);
  98. static void outtwoline(INPUT *, LINE *, INPUT *, LINE *);
  99. static void slurp(INPUT *);
  100. static wchar_t *towcs(const char *);
  101. static void usage(void);
  102. const char *argv0 = "join";
  103. int
  104. main(int argc, char *argv[])
  105. {
  106. INPUT *F1, *F2;
  107. int aflag, ch, cval, vflag;
  108. char *end;
  109. char *lc_all = setlocale(LC_ALL, "");
  110. if(lc_all == NULL)
  111. {
  112. fprintf(stderr,
  113. "%s: warning: Failed loading locales. setlocale(LC_ALL, \"\"): %s\n",
  114. argv0,
  115. strerror(errno));
  116. }
  117. F1 = &input1;
  118. F2 = &input2;
  119. aflag = vflag = 0;
  120. while((ch = getopt_nolong(argc, argv, ":a:e:j:1:2:o:t:v:")) != -1)
  121. {
  122. switch(ch)
  123. {
  124. case '1':
  125. if((F1->joinf = strtol(optarg, &end, 10)) < 1)
  126. utils_errx(1, "-1 option field number less than 1");
  127. if(*end) utils_errx(1, "illegal field number -- %s", optarg);
  128. --F1->joinf;
  129. break;
  130. case '2':
  131. if((F2->joinf = strtol(optarg, &end, 10)) < 1)
  132. utils_errx(1, "-2 option field number less than 1");
  133. if(*end) utils_errx(1, "illegal field number -- %s", optarg);
  134. --F2->joinf;
  135. break;
  136. case 'a':
  137. aflag = 1;
  138. switch(strtol(optarg, &end, 10))
  139. {
  140. case 1:
  141. F1->unpair = 1;
  142. break;
  143. case 2:
  144. F2->unpair = 1;
  145. break;
  146. default:
  147. utils_errx(1, "-a option file number not 1 or 2");
  148. break;
  149. }
  150. if(*end) utils_errx(1, "illegal file number -- %s", optarg);
  151. break;
  152. case 'e':
  153. empty = optarg;
  154. break;
  155. case 'j':
  156. if((F1->joinf = F2->joinf = strtol(optarg, &end, 10)) < 1)
  157. utils_errx(1, "-j option field number less than 1");
  158. if(*end) utils_errx(1, "illegal field number -- %s", optarg);
  159. --F1->joinf;
  160. --F2->joinf;
  161. break;
  162. case 'o':
  163. fieldarg(optarg);
  164. break;
  165. case 't':
  166. spans = 0;
  167. if(mbrtowc(&tabchar[0], optarg, MB_LEN_MAX, NULL) != strlen(optarg))
  168. utils_errx(1, "illegal tab character specification");
  169. tabchar[1] = L'\0';
  170. break;
  171. case 'v':
  172. vflag = 1;
  173. joinout = 0;
  174. switch(strtol(optarg, &end, 10))
  175. {
  176. case 1:
  177. F1->unpair = 1;
  178. break;
  179. case 2:
  180. F2->unpair = 1;
  181. break;
  182. default:
  183. utils_errx(1, "-v option file number not 1 or 2");
  184. break;
  185. }
  186. if(*end) utils_errx(1, "illegal file number -- %s", optarg);
  187. break;
  188. case ':':
  189. fprintf(stderr, "%s: error: Missing operand for option: '-%c'\n", argv0, optopt);
  190. usage();
  191. return 1;
  192. case '?':
  193. GETOPT_UNKNOWN_OPT
  194. usage();
  195. return 1;
  196. default:
  197. abort();
  198. }
  199. }
  200. argc -= optind;
  201. argv += optind;
  202. if(aflag && vflag) utils_errx(1, "the -a and -v options are mutually exclusive");
  203. if(argc != 2)
  204. {
  205. fprintf(stderr, "%s: error: Need exactly 2 file operands, got %d instead\n", argv0, argc);
  206. usage();
  207. }
  208. /* Open the files; "-" means stdin. */
  209. if(!strcmp(*argv, "-"))
  210. F1->fp = stdin;
  211. else if((F1->fp = fopen(*argv, "r")) == NULL)
  212. utils_err(1, "%s", *argv);
  213. ++argv;
  214. if(!strcmp(*argv, "-"))
  215. F2->fp = stdin;
  216. else if((F2->fp = fopen(*argv, "r")) == NULL)
  217. utils_err(1, "%s", *argv);
  218. if(F1->fp == stdin && F2->fp == stdin) utils_errx(1, "only one input file may be stdin");
  219. slurp(F1);
  220. slurp(F2);
  221. while(F1->setcnt && F2->setcnt)
  222. {
  223. cval = cmp(F1->set, F1->joinf, F2->set, F2->joinf);
  224. if(cval == 0)
  225. {
  226. /* Oh joy, oh rapture, oh beauty divine! */
  227. if(joinout) joinlines(F1, F2);
  228. slurp(F1);
  229. slurp(F2);
  230. }
  231. else if(cval < 0)
  232. {
  233. /* File 1 takes the lead... */
  234. if(F1->unpair) joinlines(F1, NULL);
  235. slurp(F1);
  236. }
  237. else
  238. {
  239. /* File 2 takes the lead... */
  240. if(F2->unpair) joinlines(F2, NULL);
  241. slurp(F2);
  242. }
  243. }
  244. /*
  245. * Now that one of the files is used up, optionally output any
  246. * remaining lines from the other file.
  247. */
  248. if(F1->unpair)
  249. while(F1->setcnt)
  250. {
  251. joinlines(F1, NULL);
  252. slurp(F1);
  253. }
  254. if(F2->unpair)
  255. while(F2->setcnt)
  256. {
  257. joinlines(F2, NULL);
  258. slurp(F2);
  259. }
  260. exit(0);
  261. }
  262. static void
  263. slurp(INPUT *F)
  264. {
  265. LINE *lp, *lastlp, tmp;
  266. size_t len;
  267. int cnt;
  268. char *bp, *fieldp;
  269. /*
  270. * Read all of the lines from an input file that have the same
  271. * join field.
  272. */
  273. F->setcnt = 0;
  274. for(lastlp = NULL;; ++F->setcnt)
  275. {
  276. /*
  277. * If we're out of space to hold line structures, allocate
  278. * more. Initialize the structure so that we know that this
  279. * is new space.
  280. */
  281. if(F->setcnt == F->setalloc)
  282. {
  283. cnt = F->setalloc;
  284. F->setalloc += 50;
  285. if(F->setalloc <= 0) utils_errx(1, "slurp setalloc overflow");
  286. if((F->set = reallocarray(F->set, F->setalloc, sizeof(LINE))) == NULL)
  287. utils_err(1, "Failed (re)allocating slurp set");
  288. memset(F->set + cnt, 0, 50 * sizeof(LINE));
  289. /* re-set lastlp in case it moved */
  290. if(lastlp != NULL) lastlp = &F->set[F->setcnt - 1];
  291. }
  292. /*
  293. * Get any pushed back line, else get the next line. Allocate
  294. * space as necessary. If taking the line from the stack swap
  295. * the two structures so that we don't lose space allocated to
  296. * either structure. This could be avoided by doing another
  297. * level of indirection, but it's probably okay as is.
  298. */
  299. lp = &F->set[F->setcnt];
  300. assert(lp);
  301. if(F->setcnt) lastlp = &F->set[F->setcnt - 1];
  302. if(F->pushbool)
  303. {
  304. tmp = F->set[F->setcnt];
  305. F->set[F->setcnt] = F->set[F->pushback];
  306. F->set[F->pushback] = tmp;
  307. F->pushbool = 0;
  308. continue;
  309. }
  310. if((bp = fgetln(F->fp, &len)) == NULL) return;
  311. if(lp->linealloc <= len + 1)
  312. {
  313. lp->linealloc += MAX(100, len + 1 - lp->linealloc);
  314. if(lp->linealloc <= 0) utils_errx(1, "slurp linealloc overflow");
  315. if((lp->line = realloc(lp->line, lp->linealloc)) == NULL)
  316. utils_err(1, "Failed (re)allocating slurp line");
  317. }
  318. memmove(lp->line, bp, len);
  319. /* Replace trailing newline, if it exists. */
  320. if(bp[len - 1] == '\n')
  321. lp->line[len - 1] = '\0';
  322. else
  323. lp->line[len] = '\0';
  324. bp = lp->line;
  325. /* Split the line into fields, allocate space as necessary. */
  326. lp->fieldcnt = 0;
  327. while((fieldp = mbssep(&bp, tabchar)) != NULL)
  328. {
  329. if(spans && *fieldp == '\0') continue;
  330. if(lp->fieldcnt == lp->fieldalloc)
  331. {
  332. lp->fieldalloc += 50;
  333. if(lp->fieldalloc < 0) utils_errx(1, "slurp fieldalloc overflow");
  334. if((lp->fields = reallocarray(lp->fields, lp->fieldalloc, sizeof(char *))) == NULL)
  335. utils_err(1, "Failed (re)allocating slurp fieldalloc");
  336. }
  337. lp->fields[lp->fieldcnt++] = fieldp;
  338. }
  339. /* See if the join field value has changed. */
  340. if(lastlp != NULL && cmp(lp, F->joinf, lastlp, F->joinf))
  341. {
  342. F->pushbool = 1;
  343. F->pushback = F->setcnt;
  344. break;
  345. }
  346. }
  347. }
  348. static char *
  349. mbssep(char **stringp, const wchar_t *delim)
  350. {
  351. char *s, *tok;
  352. const wchar_t *spanp;
  353. wchar_t c, sc;
  354. size_t n;
  355. if((s = *stringp) == NULL) return (NULL);
  356. for(tok = s;;)
  357. {
  358. n = mbrtowc(&c, s, MB_LEN_MAX, NULL);
  359. if(n == (size_t)-1 || n == (size_t)-2)
  360. {
  361. fprintf(stderr, "join: error: mbrtowc: Illegal byte sequence\n");
  362. exit(1);
  363. }
  364. s += n;
  365. spanp = delim;
  366. do
  367. {
  368. if((sc = *spanp++) == c)
  369. {
  370. if(c == 0)
  371. s = NULL;
  372. else
  373. s[-n] = '\0';
  374. *stringp = s;
  375. return (tok);
  376. }
  377. } while(sc != 0);
  378. }
  379. }
  380. static int
  381. cmp(LINE *lp1, unsigned long fieldno1, LINE *lp2, unsigned long fieldno2)
  382. {
  383. if(lp1->fieldcnt <= fieldno1) return (lp2->fieldcnt <= fieldno2 ? 0 : 1);
  384. if(lp2->fieldcnt <= fieldno2) return (-1);
  385. return (mbscoll(lp1->fields[fieldno1], lp2->fields[fieldno2]));
  386. }
  387. static int
  388. mbscoll(const char *s1, const char *s2)
  389. {
  390. wchar_t *w1, *w2;
  391. int ret;
  392. if(MB_CUR_MAX == 1) return (strcoll(s1, s2));
  393. if((w1 = towcs(s1)) == NULL || (w2 = towcs(s2)) == NULL) utils_err(1, NULL); /* XXX */
  394. ret = wcscoll(w1, w2);
  395. free(w1);
  396. free(w2);
  397. return (ret);
  398. }
  399. static wchar_t *
  400. towcs(const char *s)
  401. {
  402. wchar_t *wcs;
  403. size_t n;
  404. if((n = mbsrtowcs(NULL, &s, 0, NULL)) == (size_t)-1) return (NULL);
  405. n++;
  406. if(n <= 0) return (NULL);
  407. if((wcs = calloc(n, sizeof(*wcs))) == NULL) return (NULL);
  408. mbsrtowcs(wcs, &s, n, NULL);
  409. return (wcs);
  410. }
  411. static void
  412. joinlines(INPUT *F1, INPUT *F2)
  413. {
  414. unsigned long cnt1, cnt2;
  415. /*
  416. * Output the results of a join comparison. The output may be from
  417. * either file 1 or file 2 (in which case the first argument is the
  418. * file from which to output) or from both.
  419. */
  420. if(F2 == NULL)
  421. {
  422. for(cnt1 = 0; cnt1 < F1->setcnt; ++cnt1)
  423. outoneline(F1, &F1->set[cnt1]);
  424. return;
  425. }
  426. for(cnt1 = 0; cnt1 < F1->setcnt; ++cnt1)
  427. for(cnt2 = 0; cnt2 < F2->setcnt; ++cnt2)
  428. outtwoline(F1, &F1->set[cnt1], F2, &F2->set[cnt2]);
  429. }
  430. static void
  431. outoneline(INPUT *F, LINE *lp)
  432. {
  433. unsigned long cnt;
  434. /*
  435. * Output a single line from one of the files, according to the
  436. * join rules. This happens when we are writing unmatched single
  437. * lines. Output empty fields in the right places.
  438. */
  439. if(olist)
  440. for(cnt = 0; cnt < olistcnt; ++cnt)
  441. {
  442. if(olist[cnt].filenum == (unsigned)F->number)
  443. outfield(lp, olist[cnt].fieldno, 0);
  444. else if(olist[cnt].filenum == 0)
  445. outfield(lp, F->joinf, 0);
  446. else
  447. outfield(lp, 0, 1);
  448. }
  449. else
  450. {
  451. /*
  452. * Output the join field, then the remaining fields.
  453. */
  454. outfield(lp, F->joinf, 0);
  455. for(cnt = 0; cnt < lp->fieldcnt; ++cnt)
  456. if(F->joinf != cnt) outfield(lp, cnt, 0);
  457. }
  458. (void)printf("\n");
  459. if(ferror(stdout)) utils_err(1, "stdout");
  460. needsep = 0;
  461. }
  462. static void
  463. outtwoline(INPUT *F1, LINE *lp1, INPUT *F2, LINE *lp2)
  464. {
  465. unsigned long cnt;
  466. /* Output a pair of lines according to the join list (if any). */
  467. if(olist)
  468. for(cnt = 0; cnt < olistcnt; ++cnt)
  469. if(olist[cnt].filenum == 0)
  470. {
  471. if(lp1->fieldcnt >= F1->joinf)
  472. outfield(lp1, F1->joinf, 0);
  473. else
  474. outfield(lp2, F2->joinf, 0);
  475. }
  476. else if(olist[cnt].filenum == 1)
  477. outfield(lp1, olist[cnt].fieldno, 0);
  478. else /* if (olist[cnt].filenum == 2) */
  479. outfield(lp2, olist[cnt].fieldno, 0);
  480. else
  481. {
  482. /*
  483. * Output the join field, then the remaining fields from F1
  484. * and F2.
  485. */
  486. outfield(lp1, F1->joinf, 0);
  487. for(cnt = 0; cnt < lp1->fieldcnt; ++cnt)
  488. if(F1->joinf != cnt) outfield(lp1, cnt, 0);
  489. for(cnt = 0; cnt < lp2->fieldcnt; ++cnt)
  490. if(F2->joinf != cnt) outfield(lp2, cnt, 0);
  491. }
  492. (void)printf("\n");
  493. if(ferror(stdout)) utils_err(1, "stdout");
  494. needsep = 0;
  495. }
  496. static void
  497. outfield(LINE *lp, unsigned long fieldno, int out_empty)
  498. {
  499. if(needsep++) (void)printf("%lc", (wint_t)*tabchar);
  500. if(!ferror(stdout))
  501. {
  502. if(lp->fieldcnt <= fieldno || out_empty)
  503. {
  504. if(empty != NULL) (void)printf("%s", empty);
  505. }
  506. else
  507. {
  508. if(*lp->fields[fieldno] == '\0') return;
  509. (void)printf("%s", lp->fields[fieldno]);
  510. }
  511. }
  512. if(ferror(stdout)) utils_err(1, "stdout");
  513. }
  514. /*
  515. * Convert an output list argument "2.1, 1.3, 2.4" into an array of output
  516. * fields.
  517. */
  518. static void
  519. fieldarg(char *option)
  520. {
  521. unsigned long fieldno, filenum;
  522. char *end, *token;
  523. while((token = strsep(&option, ", \t")) != NULL)
  524. {
  525. if(*token == '\0') continue;
  526. if(token[0] == '0')
  527. filenum = fieldno = 0;
  528. else if((token[0] == '1' || token[0] == '2') && token[1] == '.')
  529. {
  530. filenum = token[0] - '0';
  531. fieldno = strtol(token + 2, &end, 10);
  532. if(*end) utils_errx(1, "malformed -o option field");
  533. if(fieldno == 0) utils_errx(1, "field numbers are 1 based");
  534. --fieldno;
  535. }
  536. else
  537. utils_errx(1, "malformed -o option field");
  538. if(olistcnt == olistalloc)
  539. {
  540. olistalloc += 50;
  541. if(olistalloc <= 0) utils_errx(1, "fieldarg olistalloc overflow");
  542. if((olist = reallocarray(olist, olistalloc, sizeof(OLIST))) == NULL)
  543. utils_err(1, "Failed (re)allocating fieldarg");
  544. }
  545. olist[olistcnt].filenum = filenum;
  546. olist[olistcnt].fieldno = fieldno;
  547. ++olistcnt;
  548. }
  549. }
  550. static void
  551. usage(void)
  552. {
  553. (void)fprintf(stderr,
  554. "%s %s\n%s\n",
  555. "usage: join [-a fileno | -v fileno ] [-e string] [-1 field]",
  556. "[-2 field]",
  557. " [-o list] [-t char] file1 file2");
  558. exit(1);
  559. }