logo

utils-std

Collection of commonly available Unix tools git clone https://anongit.hacktivis.me/git/utils-std.git/

join.c (15999B)


  1. /*-
  2. * SPDX-License-Identifier: BSD-3-Clause
  3. *
  4. * Copyright (c) 1991, 1993, 1994
  5. * The Regents of the University of California. All rights reserved.
  6. *
  7. * This code is derived from software contributed to Berkeley by
  8. * Steve Hayman of the Computer Science Department, Indiana University,
  9. * Michiro Hikida and David Goodenough.
  10. *
  11. * Redistribution and use in source and binary forms, with or without
  12. * modification, are permitted provided that the following conditions
  13. * are met:
  14. * 1. Redistributions of source code must retain the above copyright
  15. * notice, this list of conditions and the following disclaimer.
  16. * 2. Redistributions in binary form must reproduce the above copyright
  17. * notice, this list of conditions and the following disclaimer in the
  18. * documentation and/or other materials provided with the distribution.
  19. * 3. Neither the name of the University nor the names of its contributors
  20. * may be used to endorse or promote products derived from this software
  21. * without specific prior written permission.
  22. *
  23. * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  24. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  25. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  26. * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  27. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  28. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  29. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  30. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  32. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  33. * SUCH DAMAGE.
  34. */
  35. #define _POSIX_C_SOURCE 202405L
  36. #define _BSD_SOURCE // fgetln, strsep
  37. #include "../lib/reallocarray.h"
  38. #include "../libutils/err.h"
  39. #include "../libutils/getopt_nolong.h"
  40. #include <errno.h>
  41. #include <limits.h>
  42. #include <locale.h>
  43. #include <stdio.h> // fgetln
  44. #include <stdlib.h>
  45. #include <string.h> // strsep, strerror
  46. #include <sys/param.h>
  47. #include <unistd.h>
  48. #include <wchar.h>
  49. /*
  50. * There's a structure per input file which encapsulates the state of the
  51. * file. We repeatedly read lines from each file until we've read in all
  52. * the consecutive lines from the file with a common join field. Then we
  53. * compare the set of lines with an equivalent set from the other file.
  54. */
  55. typedef struct
  56. {
  57. char *line; /* line */
  58. unsigned long linealloc; /* line allocated count */
  59. char **fields; /* line field(s) */
  60. unsigned long fieldcnt; /* line field(s) count */
  61. unsigned long fieldalloc; /* line field(s) allocated count */
  62. } LINE;
  63. typedef struct
  64. {
  65. FILE *fp; /* file descriptor */
  66. unsigned long joinf; /* join field (-1, -2, -j) */
  67. int unpair; /* output unpairable lines (-a) */
  68. unsigned long number; /* 1 for file 1, 2 for file 2 */
  69. LINE *set; /* set of lines with same field */
  70. int pushbool; /* if pushback is set */
  71. unsigned long pushback; /* line on the stack */
  72. unsigned long setcnt; /* set count */
  73. unsigned long setalloc; /* set allocated count */
  74. } INPUT;
  75. static INPUT input1 = {NULL, 0, 0, 1, NULL, 0, 0, 0, 0}, input2 = {NULL, 0, 0, 2, NULL, 0, 0, 0, 0};
  76. typedef struct
  77. {
  78. unsigned long filenum; /* file number */
  79. unsigned long fieldno; /* field number */
  80. } OLIST;
  81. static OLIST *olist; /* output field list */
  82. static unsigned long olistcnt; /* output field list count */
  83. static unsigned long olistalloc; /* output field allocated count */
  84. static int joinout = 1; /* show lines with matched join fields (-v) */
  85. static int needsep; /* need separator character */
  86. static int spans = 1; /* span multiple delimiters (-t) */
  87. static char *empty; /* empty field replacement string (-e) */
  88. static wchar_t default_tabchar[] = L" \t";
  89. static wchar_t *tabchar = default_tabchar; /* delimiter characters (-t) */
  90. static int cmp(LINE *, unsigned long, LINE *, unsigned long);
  91. static void fieldarg(char *);
  92. static void joinlines(INPUT *, INPUT *);
  93. static int mbscoll(const char *, const char *);
  94. static char *mbssep(char **, const wchar_t *);
  95. static void outfield(LINE *, unsigned long, int);
  96. static void outoneline(INPUT *, LINE *);
  97. static void outtwoline(INPUT *, LINE *, INPUT *, LINE *);
  98. static void slurp(INPUT *);
  99. static wchar_t *towcs(const char *);
  100. static void usage(void);
  101. const char *argv0 = "join";
  102. int
  103. main(int argc, char *argv[])
  104. {
  105. INPUT *F1, *F2;
  106. int aflag, ch, cval, vflag;
  107. char *end;
  108. char *lc_all = setlocale(LC_ALL, "");
  109. if(lc_all == NULL)
  110. {
  111. fprintf(stderr,
  112. "%s: warning: Failed loading locales. setlocale(LC_ALL, \"\"): %s\n",
  113. argv0,
  114. strerror(errno));
  115. }
  116. F1 = &input1;
  117. F2 = &input2;
  118. aflag = vflag = 0;
  119. while((ch = getopt_nolong(argc, argv, ":a:e:j:1:2:o:t:v:")) != -1)
  120. {
  121. switch(ch)
  122. {
  123. case '1':
  124. if((F1->joinf = strtol(optarg, &end, 10)) < 1)
  125. utils_errx(1, "-1 option field number less than 1");
  126. if(*end) utils_errx(1, "illegal field number -- %s", optarg);
  127. --F1->joinf;
  128. break;
  129. case '2':
  130. if((F2->joinf = strtol(optarg, &end, 10)) < 1)
  131. utils_errx(1, "-2 option field number less than 1");
  132. if(*end) utils_errx(1, "illegal field number -- %s", optarg);
  133. --F2->joinf;
  134. break;
  135. case 'a':
  136. aflag = 1;
  137. switch(strtol(optarg, &end, 10))
  138. {
  139. case 1:
  140. F1->unpair = 1;
  141. break;
  142. case 2:
  143. F2->unpair = 1;
  144. break;
  145. default:
  146. utils_errx(1, "-a option file number not 1 or 2");
  147. break;
  148. }
  149. if(*end) utils_errx(1, "illegal file number -- %s", optarg);
  150. break;
  151. case 'e':
  152. empty = optarg;
  153. break;
  154. case 'j':
  155. if((F1->joinf = F2->joinf = strtol(optarg, &end, 10)) < 1)
  156. utils_errx(1, "-j option field number less than 1");
  157. if(*end) utils_errx(1, "illegal field number -- %s", optarg);
  158. --F1->joinf;
  159. --F2->joinf;
  160. break;
  161. case 'o':
  162. fieldarg(optarg);
  163. break;
  164. case 't':
  165. spans = 0;
  166. if(mbrtowc(&tabchar[0], optarg, MB_LEN_MAX, NULL) != strlen(optarg))
  167. utils_errx(1, "illegal tab character specification");
  168. tabchar[1] = L'\0';
  169. break;
  170. case 'v':
  171. vflag = 1;
  172. joinout = 0;
  173. switch(strtol(optarg, &end, 10))
  174. {
  175. case 1:
  176. F1->unpair = 1;
  177. break;
  178. case 2:
  179. F2->unpair = 1;
  180. break;
  181. default:
  182. utils_errx(1, "-v option file number not 1 or 2");
  183. break;
  184. }
  185. if(*end) utils_errx(1, "illegal file number -- %s", optarg);
  186. break;
  187. case ':':
  188. fprintf(stderr, "%s: error: Missing operand for option: '-%c'\n", argv0, optopt);
  189. usage();
  190. return 1;
  191. case '?':
  192. GETOPT_UNKNOWN_OPT
  193. usage();
  194. return 1;
  195. default:
  196. abort();
  197. }
  198. }
  199. argc -= optind;
  200. argv += optind;
  201. if(aflag && vflag) utils_errx(1, "the -a and -v options are mutually exclusive");
  202. if(argc != 2)
  203. {
  204. fprintf(stderr, "%s: error: Need exactly 2 file operands, got %d instead\n", argv0, argc);
  205. usage();
  206. }
  207. /* Open the files; "-" means stdin. */
  208. if(!strcmp(*argv, "-"))
  209. F1->fp = stdin;
  210. else if((F1->fp = fopen(*argv, "r")) == NULL)
  211. utils_err(1, "%s", *argv);
  212. ++argv;
  213. if(!strcmp(*argv, "-"))
  214. F2->fp = stdin;
  215. else if((F2->fp = fopen(*argv, "r")) == NULL)
  216. utils_err(1, "%s", *argv);
  217. if(F1->fp == stdin && F2->fp == stdin) utils_errx(1, "only one input file may be stdin");
  218. slurp(F1);
  219. slurp(F2);
  220. while(F1->setcnt && F2->setcnt)
  221. {
  222. cval = cmp(F1->set, F1->joinf, F2->set, F2->joinf);
  223. if(cval == 0)
  224. {
  225. /* Oh joy, oh rapture, oh beauty divine! */
  226. if(joinout) joinlines(F1, F2);
  227. slurp(F1);
  228. slurp(F2);
  229. }
  230. else if(cval < 0)
  231. {
  232. /* File 1 takes the lead... */
  233. if(F1->unpair) joinlines(F1, NULL);
  234. slurp(F1);
  235. }
  236. else
  237. {
  238. /* File 2 takes the lead... */
  239. if(F2->unpair) joinlines(F2, NULL);
  240. slurp(F2);
  241. }
  242. }
  243. /*
  244. * Now that one of the files is used up, optionally output any
  245. * remaining lines from the other file.
  246. */
  247. if(F1->unpair)
  248. while(F1->setcnt)
  249. {
  250. joinlines(F1, NULL);
  251. slurp(F1);
  252. }
  253. if(F2->unpair)
  254. while(F2->setcnt)
  255. {
  256. joinlines(F2, NULL);
  257. slurp(F2);
  258. }
  259. exit(0);
  260. }
  261. static void
  262. slurp(INPUT *F)
  263. {
  264. LINE *lp, *lastlp, tmp;
  265. size_t len;
  266. int cnt;
  267. char *bp, *fieldp;
  268. /*
  269. * Read all of the lines from an input file that have the same
  270. * join field.
  271. */
  272. F->setcnt = 0;
  273. for(lastlp = NULL;; ++F->setcnt)
  274. {
  275. /*
  276. * If we're out of space to hold line structures, allocate
  277. * more. Initialize the structure so that we know that this
  278. * is new space.
  279. */
  280. if(F->setcnt == F->setalloc)
  281. {
  282. cnt = F->setalloc;
  283. F->setalloc += 50;
  284. if(F->setalloc <= 0) utils_errx(1, "slurp setalloc overflow");
  285. if((F->set = reallocarray(F->set, F->setalloc, sizeof(LINE))) == NULL)
  286. utils_err(1, "Failed (re)allocating slurp set");
  287. memset(F->set + cnt, 0, 50 * sizeof(LINE));
  288. /* re-set lastlp in case it moved */
  289. if(lastlp != NULL) lastlp = &F->set[F->setcnt - 1];
  290. }
  291. /*
  292. * Get any pushed back line, else get the next line. Allocate
  293. * space as necessary. If taking the line from the stack swap
  294. * the two structures so that we don't lose space allocated to
  295. * either structure. This could be avoided by doing another
  296. * level of indirection, but it's probably okay as is.
  297. */
  298. lp = &F->set[F->setcnt];
  299. if(F->setcnt) lastlp = &F->set[F->setcnt - 1];
  300. if(F->pushbool)
  301. {
  302. tmp = F->set[F->setcnt];
  303. F->set[F->setcnt] = F->set[F->pushback];
  304. F->set[F->pushback] = tmp;
  305. F->pushbool = 0;
  306. continue;
  307. }
  308. if((bp = fgetln(F->fp, &len)) == NULL) return;
  309. if(lp->linealloc <= len + 1)
  310. {
  311. lp->linealloc += MAX(100, len + 1 - lp->linealloc);
  312. if(lp->linealloc <= 0) utils_errx(1, "slurp linealloc overflow");
  313. if((lp->line = realloc(lp->line, lp->linealloc)) == NULL)
  314. utils_err(1, "Failed (re)allocating slurp line");
  315. }
  316. memmove(lp->line, bp, len);
  317. /* Replace trailing newline, if it exists. */
  318. if(bp[len - 1] == '\n')
  319. lp->line[len - 1] = '\0';
  320. else
  321. lp->line[len] = '\0';
  322. bp = lp->line;
  323. /* Split the line into fields, allocate space as necessary. */
  324. lp->fieldcnt = 0;
  325. while((fieldp = mbssep(&bp, tabchar)) != NULL)
  326. {
  327. if(spans && *fieldp == '\0') continue;
  328. if(lp->fieldcnt == lp->fieldalloc)
  329. {
  330. lp->fieldalloc += 50;
  331. if(lp->fieldalloc < 0) utils_errx(1, "slurp fieldalloc overflow");
  332. if((lp->fields = reallocarray(lp->fields, lp->fieldalloc, sizeof(char *))) == NULL)
  333. utils_err(1, "Failed (re)allocating slurp fieldalloc");
  334. }
  335. lp->fields[lp->fieldcnt++] = fieldp;
  336. }
  337. /* See if the join field value has changed. */
  338. if(lastlp != NULL && cmp(lp, F->joinf, lastlp, F->joinf))
  339. {
  340. F->pushbool = 1;
  341. F->pushback = F->setcnt;
  342. break;
  343. }
  344. }
  345. }
  346. static char *
  347. mbssep(char **stringp, const wchar_t *delim)
  348. {
  349. char *s, *tok;
  350. const wchar_t *spanp;
  351. wchar_t c, sc;
  352. size_t n;
  353. if((s = *stringp) == NULL) return (NULL);
  354. for(tok = s;;)
  355. {
  356. n = mbrtowc(&c, s, MB_LEN_MAX, NULL);
  357. if(n == (size_t)-1 || n == (size_t)-2)
  358. {
  359. fprintf(stderr, "join: error: mbrtowc: Illegal byte sequence\n");
  360. exit(1);
  361. }
  362. s += n;
  363. spanp = delim;
  364. do
  365. {
  366. if((sc = *spanp++) == c)
  367. {
  368. if(c == 0)
  369. s = NULL;
  370. else
  371. s[-n] = '\0';
  372. *stringp = s;
  373. return (tok);
  374. }
  375. } while(sc != 0);
  376. }
  377. }
  378. static int
  379. cmp(LINE *lp1, unsigned long fieldno1, LINE *lp2, unsigned long fieldno2)
  380. {
  381. if(lp1->fieldcnt <= fieldno1) return (lp2->fieldcnt <= fieldno2 ? 0 : 1);
  382. if(lp2->fieldcnt <= fieldno2) return (-1);
  383. return (mbscoll(lp1->fields[fieldno1], lp2->fields[fieldno2]));
  384. }
  385. static int
  386. mbscoll(const char *s1, const char *s2)
  387. {
  388. wchar_t *w1, *w2;
  389. int ret;
  390. if(MB_CUR_MAX == 1) return (strcoll(s1, s2));
  391. if((w1 = towcs(s1)) == NULL || (w2 = towcs(s2)) == NULL) utils_err(1, NULL); /* XXX */
  392. ret = wcscoll(w1, w2);
  393. free(w1);
  394. free(w2);
  395. return (ret);
  396. }
  397. static wchar_t *
  398. towcs(const char *s)
  399. {
  400. wchar_t *wcs;
  401. size_t n;
  402. if((n = mbsrtowcs(NULL, &s, 0, NULL)) == (size_t)-1) return (NULL);
  403. n++;
  404. if(n <= 0) return (NULL);
  405. if((wcs = calloc(n, sizeof(*wcs))) == NULL) return (NULL);
  406. mbsrtowcs(wcs, &s, n, NULL);
  407. return (wcs);
  408. }
  409. static void
  410. joinlines(INPUT *F1, INPUT *F2)
  411. {
  412. unsigned long cnt1, cnt2;
  413. /*
  414. * Output the results of a join comparison. The output may be from
  415. * either file 1 or file 2 (in which case the first argument is the
  416. * file from which to output) or from both.
  417. */
  418. if(F2 == NULL)
  419. {
  420. for(cnt1 = 0; cnt1 < F1->setcnt; ++cnt1)
  421. outoneline(F1, &F1->set[cnt1]);
  422. return;
  423. }
  424. for(cnt1 = 0; cnt1 < F1->setcnt; ++cnt1)
  425. for(cnt2 = 0; cnt2 < F2->setcnt; ++cnt2)
  426. outtwoline(F1, &F1->set[cnt1], F2, &F2->set[cnt2]);
  427. }
  428. static void
  429. outoneline(INPUT *F, LINE *lp)
  430. {
  431. unsigned long cnt;
  432. /*
  433. * Output a single line from one of the files, according to the
  434. * join rules. This happens when we are writing unmatched single
  435. * lines. Output empty fields in the right places.
  436. */
  437. if(olist)
  438. for(cnt = 0; cnt < olistcnt; ++cnt)
  439. {
  440. if(olist[cnt].filenum == (unsigned)F->number)
  441. outfield(lp, olist[cnt].fieldno, 0);
  442. else if(olist[cnt].filenum == 0)
  443. outfield(lp, F->joinf, 0);
  444. else
  445. outfield(lp, 0, 1);
  446. }
  447. else
  448. {
  449. /*
  450. * Output the join field, then the remaining fields.
  451. */
  452. outfield(lp, F->joinf, 0);
  453. for(cnt = 0; cnt < lp->fieldcnt; ++cnt)
  454. if(F->joinf != cnt) outfield(lp, cnt, 0);
  455. }
  456. (void)printf("\n");
  457. if(ferror(stdout)) utils_err(1, "stdout");
  458. needsep = 0;
  459. }
  460. static void
  461. outtwoline(INPUT *F1, LINE *lp1, INPUT *F2, LINE *lp2)
  462. {
  463. unsigned long cnt;
  464. /* Output a pair of lines according to the join list (if any). */
  465. if(olist)
  466. for(cnt = 0; cnt < olistcnt; ++cnt)
  467. if(olist[cnt].filenum == 0)
  468. {
  469. if(lp1->fieldcnt >= F1->joinf)
  470. outfield(lp1, F1->joinf, 0);
  471. else
  472. outfield(lp2, F2->joinf, 0);
  473. }
  474. else if(olist[cnt].filenum == 1)
  475. outfield(lp1, olist[cnt].fieldno, 0);
  476. else /* if (olist[cnt].filenum == 2) */
  477. outfield(lp2, olist[cnt].fieldno, 0);
  478. else
  479. {
  480. /*
  481. * Output the join field, then the remaining fields from F1
  482. * and F2.
  483. */
  484. outfield(lp1, F1->joinf, 0);
  485. for(cnt = 0; cnt < lp1->fieldcnt; ++cnt)
  486. if(F1->joinf != cnt) outfield(lp1, cnt, 0);
  487. for(cnt = 0; cnt < lp2->fieldcnt; ++cnt)
  488. if(F2->joinf != cnt) outfield(lp2, cnt, 0);
  489. }
  490. (void)printf("\n");
  491. if(ferror(stdout)) utils_err(1, "stdout");
  492. needsep = 0;
  493. }
  494. static void
  495. outfield(LINE *lp, unsigned long fieldno, int out_empty)
  496. {
  497. if(needsep++) (void)printf("%lc", (wint_t)*tabchar);
  498. if(!ferror(stdout))
  499. {
  500. if(lp->fieldcnt <= fieldno || out_empty)
  501. {
  502. if(empty != NULL) (void)printf("%s", empty);
  503. }
  504. else
  505. {
  506. if(*lp->fields[fieldno] == '\0') return;
  507. (void)printf("%s", lp->fields[fieldno]);
  508. }
  509. }
  510. if(ferror(stdout)) utils_err(1, "stdout");
  511. }
  512. /*
  513. * Convert an output list argument "2.1, 1.3, 2.4" into an array of output
  514. * fields.
  515. */
  516. static void
  517. fieldarg(char *option)
  518. {
  519. unsigned long fieldno, filenum;
  520. char *end, *token;
  521. while((token = strsep(&option, ", \t")) != NULL)
  522. {
  523. if(*token == '\0') continue;
  524. if(token[0] == '0')
  525. filenum = fieldno = 0;
  526. else if((token[0] == '1' || token[0] == '2') && token[1] == '.')
  527. {
  528. filenum = token[0] - '0';
  529. fieldno = strtol(token + 2, &end, 10);
  530. if(*end) utils_errx(1, "malformed -o option field");
  531. if(fieldno == 0) utils_errx(1, "field numbers are 1 based");
  532. --fieldno;
  533. }
  534. else
  535. utils_errx(1, "malformed -o option field");
  536. if(olistcnt == olistalloc)
  537. {
  538. olistalloc += 50;
  539. if(olistalloc <= 0) utils_errx(1, "fieldarg olistalloc overflow");
  540. if((olist = reallocarray(olist, olistalloc, sizeof(OLIST))) == NULL)
  541. utils_err(1, "Failed (re)allocating fieldarg");
  542. }
  543. olist[olistcnt].filenum = filenum;
  544. olist[olistcnt].fieldno = fieldno;
  545. ++olistcnt;
  546. }
  547. }
  548. static void
  549. usage(void)
  550. {
  551. (void)fprintf(stderr,
  552. "%s %s\n%s\n",
  553. "usage: join [-a fileno | -v fileno ] [-e string] [-1 field]",
  554. "[-2 field]",
  555. " [-o list] [-t char] file1 file2");
  556. exit(1);
  557. }