logo

utils-std

Collection of commonly available Unix tools git clone https://anongit.hacktivis.me/git/utils-std.git/

join.c (15552B)


  1. /*-
  2. * SPDX-License-Identifier: BSD-3-Clause
  3. *
  4. * Copyright (c) 1991, 1993, 1994
  5. * The Regents of the University of California. All rights reserved.
  6. *
  7. * This code is derived from software contributed to Berkeley by
  8. * Steve Hayman of the Computer Science Department, Indiana University,
  9. * Michiro Hikida and David Goodenough.
  10. *
  11. * Redistribution and use in source and binary forms, with or without
  12. * modification, are permitted provided that the following conditions
  13. * are met:
  14. * 1. Redistributions of source code must retain the above copyright
  15. * notice, this list of conditions and the following disclaimer.
  16. * 2. Redistributions in binary form must reproduce the above copyright
  17. * notice, this list of conditions and the following disclaimer in the
  18. * documentation and/or other materials provided with the distribution.
  19. * 3. Neither the name of the University nor the names of its contributors
  20. * may be used to endorse or promote products derived from this software
  21. * without specific prior written permission.
  22. *
  23. * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  24. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  25. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  26. * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  27. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  28. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  29. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  30. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  32. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  33. * SUCH DAMAGE.
  34. */
  35. #define _BSD_SOURCE // fgetln, strsep
  36. #include "../lib/err.h"
  37. #include "../lib/getopt_nolong.h"
  38. #include <errno.h>
  39. #include <limits.h>
  40. #include <locale.h>
  41. #include <stdio.h> // fgetln
  42. #include <stdlib.h>
  43. #include <string.h> // strsep, strerror
  44. #include <sys/param.h>
  45. #include <unistd.h>
  46. #include <wchar.h>
  47. /*
  48. * There's a structure per input file which encapsulates the state of the
  49. * file. We repeatedly read lines from each file until we've read in all
  50. * the consecutive lines from the file with a common join field. Then we
  51. * compare the set of lines with an equivalent set from the other file.
  52. */
  53. typedef struct
  54. {
  55. char *line; /* line */
  56. unsigned long linealloc; /* line allocated count */
  57. char **fields; /* line field(s) */
  58. unsigned long fieldcnt; /* line field(s) count */
  59. unsigned long fieldalloc; /* line field(s) allocated count */
  60. } LINE;
  61. typedef struct
  62. {
  63. FILE *fp; /* file descriptor */
  64. unsigned long joinf; /* join field (-1, -2, -j) */
  65. int unpair; /* output unpairable lines (-a) */
  66. unsigned long number; /* 1 for file 1, 2 for file 2 */
  67. LINE *set; /* set of lines with same field */
  68. int pushbool; /* if pushback is set */
  69. unsigned long pushback; /* line on the stack */
  70. unsigned long setcnt; /* set count */
  71. unsigned long setalloc; /* set allocated count */
  72. } INPUT;
  73. static INPUT input1 = {NULL, 0, 0, 1, NULL, 0, 0, 0, 0}, input2 = {NULL, 0, 0, 2, NULL, 0, 0, 0, 0};
  74. typedef struct
  75. {
  76. unsigned long filenum; /* file number */
  77. unsigned long fieldno; /* field number */
  78. } OLIST;
  79. static OLIST *olist; /* output field list */
  80. static unsigned long olistcnt; /* output field list count */
  81. static unsigned long olistalloc; /* output field allocated count */
  82. static int joinout = 1; /* show lines with matched join fields (-v) */
  83. static int needsep; /* need separator character */
  84. static int spans = 1; /* span multiple delimiters (-t) */
  85. static char *empty; /* empty field replacement string (-e) */
  86. static wchar_t default_tabchar[] = L" \t";
  87. static wchar_t *tabchar = default_tabchar; /* delimiter characters (-t) */
  88. static int cmp(LINE *, unsigned long, LINE *, unsigned long);
  89. static void fieldarg(char *);
  90. static void joinlines(INPUT *, INPUT *);
  91. static int mbscoll(const char *, const char *);
  92. static char *mbssep(char **, const wchar_t *);
  93. static void outfield(LINE *, unsigned long, int);
  94. static void outoneline(INPUT *, LINE *);
  95. static void outtwoline(INPUT *, LINE *, INPUT *, LINE *);
  96. static void slurp(INPUT *);
  97. static wchar_t *towcs(const char *);
  98. static void usage(void);
  99. const char *argv0 = "join";
  100. int
  101. main(int argc, char *argv[])
  102. {
  103. INPUT *F1, *F2;
  104. int aflag, ch, cval, vflag;
  105. char *end;
  106. char *lc_all = setlocale(LC_ALL, "");
  107. if(lc_all == NULL)
  108. {
  109. fprintf(stderr,
  110. "%s: warning: Failed loading locales. setlocale(LC_ALL, \"\"): %s\n",
  111. argv0,
  112. strerror(errno));
  113. }
  114. F1 = &input1;
  115. F2 = &input2;
  116. aflag = vflag = 0;
  117. while((ch = getopt_nolong(argc, argv, ":a:e:j:1:2:o:t:v:")) != -1)
  118. {
  119. switch(ch)
  120. {
  121. case '1':
  122. if((F1->joinf = strtol(optarg, &end, 10)) < 1)
  123. utils_errx(1, "-1 option field number less than 1");
  124. if(*end) utils_errx(1, "illegal field number -- %s", optarg);
  125. --F1->joinf;
  126. break;
  127. case '2':
  128. if((F2->joinf = strtol(optarg, &end, 10)) < 1)
  129. utils_errx(1, "-2 option field number less than 1");
  130. if(*end) utils_errx(1, "illegal field number -- %s", optarg);
  131. --F2->joinf;
  132. break;
  133. case 'a':
  134. aflag = 1;
  135. switch(strtol(optarg, &end, 10))
  136. {
  137. case 1:
  138. F1->unpair = 1;
  139. break;
  140. case 2:
  141. F2->unpair = 1;
  142. break;
  143. default:
  144. utils_errx(1, "-a option file number not 1 or 2");
  145. break;
  146. }
  147. if(*end) utils_errx(1, "illegal file number -- %s", optarg);
  148. break;
  149. case 'e':
  150. empty = optarg;
  151. break;
  152. case 'j':
  153. if((F1->joinf = F2->joinf = strtol(optarg, &end, 10)) < 1)
  154. utils_errx(1, "-j option field number less than 1");
  155. if(*end) utils_errx(1, "illegal field number -- %s", optarg);
  156. --F1->joinf;
  157. --F2->joinf;
  158. break;
  159. case 'o':
  160. fieldarg(optarg);
  161. break;
  162. case 't':
  163. spans = 0;
  164. if(mbrtowc(&tabchar[0], optarg, MB_LEN_MAX, NULL) != strlen(optarg))
  165. utils_errx(1, "illegal tab character specification");
  166. tabchar[1] = L'\0';
  167. break;
  168. case 'v':
  169. vflag = 1;
  170. joinout = 0;
  171. switch(strtol(optarg, &end, 10))
  172. {
  173. case 1:
  174. F1->unpair = 1;
  175. break;
  176. case 2:
  177. F2->unpair = 1;
  178. break;
  179. default:
  180. utils_errx(1, "-v option file number not 1 or 2");
  181. break;
  182. }
  183. if(*end) utils_errx(1, "illegal file number -- %s", optarg);
  184. break;
  185. case ':':
  186. fprintf(stderr, "%s: error: Missing operand for option: '-%c'\n", argv0, optopt);
  187. usage();
  188. return 1;
  189. case '?':
  190. if(!got_long_opt) fprintf(stderr, "%s: error: Unrecognised option: '-%c'\n", argv0, optopt);
  191. usage();
  192. return 1;
  193. default:
  194. abort();
  195. }
  196. }
  197. argc -= optind;
  198. argv += optind;
  199. if(aflag && vflag) utils_errx(1, "the -a and -v options are mutually exclusive");
  200. if(argc != 2)
  201. {
  202. fprintf(stderr, "%s: error: Need exactly 2 file operands, got %d instead\n", argv0, argc);
  203. usage();
  204. }
  205. /* Open the files; "-" means stdin. */
  206. if(!strcmp(*argv, "-"))
  207. F1->fp = stdin;
  208. else if((F1->fp = fopen(*argv, "r")) == NULL)
  209. utils_err(1, "%s", *argv);
  210. ++argv;
  211. if(!strcmp(*argv, "-"))
  212. F2->fp = stdin;
  213. else if((F2->fp = fopen(*argv, "r")) == NULL)
  214. utils_err(1, "%s", *argv);
  215. if(F1->fp == stdin && F2->fp == stdin) utils_errx(1, "only one input file may be stdin");
  216. slurp(F1);
  217. slurp(F2);
  218. while(F1->setcnt && F2->setcnt)
  219. {
  220. cval = cmp(F1->set, F1->joinf, F2->set, F2->joinf);
  221. if(cval == 0)
  222. {
  223. /* Oh joy, oh rapture, oh beauty divine! */
  224. if(joinout) joinlines(F1, F2);
  225. slurp(F1);
  226. slurp(F2);
  227. }
  228. else if(cval < 0)
  229. {
  230. /* File 1 takes the lead... */
  231. if(F1->unpair) joinlines(F1, NULL);
  232. slurp(F1);
  233. }
  234. else
  235. {
  236. /* File 2 takes the lead... */
  237. if(F2->unpair) joinlines(F2, NULL);
  238. slurp(F2);
  239. }
  240. }
  241. /*
  242. * Now that one of the files is used up, optionally output any
  243. * remaining lines from the other file.
  244. */
  245. if(F1->unpair)
  246. while(F1->setcnt)
  247. {
  248. joinlines(F1, NULL);
  249. slurp(F1);
  250. }
  251. if(F2->unpair)
  252. while(F2->setcnt)
  253. {
  254. joinlines(F2, NULL);
  255. slurp(F2);
  256. }
  257. exit(0);
  258. }
  259. static void
  260. slurp(INPUT *F)
  261. {
  262. LINE *lp, *lastlp, tmp;
  263. size_t len;
  264. int cnt;
  265. char *bp, *fieldp;
  266. /*
  267. * Read all of the lines from an input file that have the same
  268. * join field.
  269. */
  270. F->setcnt = 0;
  271. for(lastlp = NULL;; ++F->setcnt)
  272. {
  273. /*
  274. * If we're out of space to hold line structures, allocate
  275. * more. Initialize the structure so that we know that this
  276. * is new space.
  277. */
  278. if(F->setcnt == F->setalloc)
  279. {
  280. cnt = F->setalloc;
  281. F->setalloc += 50;
  282. if((F->set = realloc(F->set, F->setalloc * sizeof(LINE))) == NULL) utils_err(1, NULL);
  283. memset(F->set + cnt, 0, 50 * sizeof(LINE));
  284. /* re-set lastlp in case it moved */
  285. if(lastlp != NULL) lastlp = &F->set[F->setcnt - 1];
  286. }
  287. /*
  288. * Get any pushed back line, else get the next line. Allocate
  289. * space as necessary. If taking the line from the stack swap
  290. * the two structures so that we don't lose space allocated to
  291. * either structure. This could be avoided by doing another
  292. * level of indirection, but it's probably okay as is.
  293. */
  294. lp = &F->set[F->setcnt];
  295. if(F->setcnt) lastlp = &F->set[F->setcnt - 1];
  296. if(F->pushbool)
  297. {
  298. tmp = F->set[F->setcnt];
  299. F->set[F->setcnt] = F->set[F->pushback];
  300. F->set[F->pushback] = tmp;
  301. F->pushbool = 0;
  302. continue;
  303. }
  304. if((bp = fgetln(F->fp, &len)) == NULL) return;
  305. if(lp->linealloc <= len + 1)
  306. {
  307. lp->linealloc += MAX(100, len + 1 - lp->linealloc);
  308. if((lp->line = realloc(lp->line, lp->linealloc)) == NULL) utils_err(1, NULL);
  309. }
  310. memmove(lp->line, bp, len);
  311. /* Replace trailing newline, if it exists. */
  312. if(bp[len - 1] == '\n')
  313. lp->line[len - 1] = '\0';
  314. else
  315. lp->line[len] = '\0';
  316. bp = lp->line;
  317. /* Split the line into fields, allocate space as necessary. */
  318. lp->fieldcnt = 0;
  319. while((fieldp = mbssep(&bp, tabchar)) != NULL)
  320. {
  321. if(spans && *fieldp == '\0') continue;
  322. if(lp->fieldcnt == lp->fieldalloc)
  323. {
  324. lp->fieldalloc += 50;
  325. if((lp->fields = realloc(lp->fields, lp->fieldalloc * sizeof(char *))) == NULL)
  326. utils_err(1, NULL);
  327. }
  328. lp->fields[lp->fieldcnt++] = fieldp;
  329. }
  330. /* See if the join field value has changed. */
  331. if(lastlp != NULL && cmp(lp, F->joinf, lastlp, F->joinf))
  332. {
  333. F->pushbool = 1;
  334. F->pushback = F->setcnt;
  335. break;
  336. }
  337. }
  338. }
  339. static char *
  340. mbssep(char **stringp, const wchar_t *delim)
  341. {
  342. char *s, *tok;
  343. const wchar_t *spanp;
  344. wchar_t c, sc;
  345. size_t n;
  346. if((s = *stringp) == NULL) return (NULL);
  347. for(tok = s;;)
  348. {
  349. n = mbrtowc(&c, s, MB_LEN_MAX, NULL);
  350. if(n == (size_t)-1 || n == (size_t)-2)
  351. {
  352. fprintf(stderr, "join: error: mbrtowc: Illegal byte sequence\n");
  353. exit(1);
  354. }
  355. s += n;
  356. spanp = delim;
  357. do
  358. {
  359. if((sc = *spanp++) == c)
  360. {
  361. if(c == 0)
  362. s = NULL;
  363. else
  364. s[-n] = '\0';
  365. *stringp = s;
  366. return (tok);
  367. }
  368. } while(sc != 0);
  369. }
  370. }
  371. static int
  372. cmp(LINE *lp1, unsigned long fieldno1, LINE *lp2, unsigned long fieldno2)
  373. {
  374. if(lp1->fieldcnt <= fieldno1) return (lp2->fieldcnt <= fieldno2 ? 0 : 1);
  375. if(lp2->fieldcnt <= fieldno2) return (-1);
  376. return (mbscoll(lp1->fields[fieldno1], lp2->fields[fieldno2]));
  377. }
  378. static int
  379. mbscoll(const char *s1, const char *s2)
  380. {
  381. wchar_t *w1, *w2;
  382. int ret;
  383. if(MB_CUR_MAX == 1) return (strcoll(s1, s2));
  384. if((w1 = towcs(s1)) == NULL || (w2 = towcs(s2)) == NULL) utils_err(1, NULL); /* XXX */
  385. ret = wcscoll(w1, w2);
  386. free(w1);
  387. free(w2);
  388. return (ret);
  389. }
  390. static wchar_t *
  391. towcs(const char *s)
  392. {
  393. wchar_t *wcs;
  394. size_t n;
  395. if((n = mbsrtowcs(NULL, &s, 0, NULL)) == (size_t)-1) return (NULL);
  396. if((wcs = calloc((n + 1), sizeof(*wcs))) == NULL) return (NULL);
  397. mbsrtowcs(wcs, &s, n + 1, NULL);
  398. return (wcs);
  399. }
  400. static void
  401. joinlines(INPUT *F1, INPUT *F2)
  402. {
  403. unsigned long cnt1, cnt2;
  404. /*
  405. * Output the results of a join comparison. The output may be from
  406. * either file 1 or file 2 (in which case the first argument is the
  407. * file from which to output) or from both.
  408. */
  409. if(F2 == NULL)
  410. {
  411. for(cnt1 = 0; cnt1 < F1->setcnt; ++cnt1)
  412. outoneline(F1, &F1->set[cnt1]);
  413. return;
  414. }
  415. for(cnt1 = 0; cnt1 < F1->setcnt; ++cnt1)
  416. for(cnt2 = 0; cnt2 < F2->setcnt; ++cnt2)
  417. outtwoline(F1, &F1->set[cnt1], F2, &F2->set[cnt2]);
  418. }
  419. static void
  420. outoneline(INPUT *F, LINE *lp)
  421. {
  422. unsigned long cnt;
  423. /*
  424. * Output a single line from one of the files, according to the
  425. * join rules. This happens when we are writing unmatched single
  426. * lines. Output empty fields in the right places.
  427. */
  428. if(olist)
  429. for(cnt = 0; cnt < olistcnt; ++cnt)
  430. {
  431. if(olist[cnt].filenum == (unsigned)F->number)
  432. outfield(lp, olist[cnt].fieldno, 0);
  433. else if(olist[cnt].filenum == 0)
  434. outfield(lp, F->joinf, 0);
  435. else
  436. outfield(lp, 0, 1);
  437. }
  438. else
  439. {
  440. /*
  441. * Output the join field, then the remaining fields.
  442. */
  443. outfield(lp, F->joinf, 0);
  444. for(cnt = 0; cnt < lp->fieldcnt; ++cnt)
  445. if(F->joinf != cnt) outfield(lp, cnt, 0);
  446. }
  447. (void)printf("\n");
  448. if(ferror(stdout)) utils_err(1, "stdout");
  449. needsep = 0;
  450. }
  451. static void
  452. outtwoline(INPUT *F1, LINE *lp1, INPUT *F2, LINE *lp2)
  453. {
  454. unsigned long cnt;
  455. /* Output a pair of lines according to the join list (if any). */
  456. if(olist)
  457. for(cnt = 0; cnt < olistcnt; ++cnt)
  458. if(olist[cnt].filenum == 0)
  459. {
  460. if(lp1->fieldcnt >= F1->joinf)
  461. outfield(lp1, F1->joinf, 0);
  462. else
  463. outfield(lp2, F2->joinf, 0);
  464. }
  465. else if(olist[cnt].filenum == 1)
  466. outfield(lp1, olist[cnt].fieldno, 0);
  467. else /* if (olist[cnt].filenum == 2) */
  468. outfield(lp2, olist[cnt].fieldno, 0);
  469. else
  470. {
  471. /*
  472. * Output the join field, then the remaining fields from F1
  473. * and F2.
  474. */
  475. outfield(lp1, F1->joinf, 0);
  476. for(cnt = 0; cnt < lp1->fieldcnt; ++cnt)
  477. if(F1->joinf != cnt) outfield(lp1, cnt, 0);
  478. for(cnt = 0; cnt < lp2->fieldcnt; ++cnt)
  479. if(F2->joinf != cnt) outfield(lp2, cnt, 0);
  480. }
  481. (void)printf("\n");
  482. if(ferror(stdout)) utils_err(1, "stdout");
  483. needsep = 0;
  484. }
  485. static void
  486. outfield(LINE *lp, unsigned long fieldno, int out_empty)
  487. {
  488. if(needsep++) (void)printf("%lc", (wint_t)*tabchar);
  489. if(!ferror(stdout))
  490. {
  491. if(lp->fieldcnt <= fieldno || out_empty)
  492. {
  493. if(empty != NULL) (void)printf("%s", empty);
  494. }
  495. else
  496. {
  497. if(*lp->fields[fieldno] == '\0') return;
  498. (void)printf("%s", lp->fields[fieldno]);
  499. }
  500. }
  501. if(ferror(stdout)) utils_err(1, "stdout");
  502. }
  503. /*
  504. * Convert an output list argument "2.1, 1.3, 2.4" into an array of output
  505. * fields.
  506. */
  507. static void
  508. fieldarg(char *option)
  509. {
  510. unsigned long fieldno, filenum;
  511. char *end, *token;
  512. while((token = strsep(&option, ", \t")) != NULL)
  513. {
  514. if(*token == '\0') continue;
  515. if(token[0] == '0')
  516. filenum = fieldno = 0;
  517. else if((token[0] == '1' || token[0] == '2') && token[1] == '.')
  518. {
  519. filenum = token[0] - '0';
  520. fieldno = strtol(token + 2, &end, 10);
  521. if(*end) utils_errx(1, "malformed -o option field");
  522. if(fieldno == 0) utils_errx(1, "field numbers are 1 based");
  523. --fieldno;
  524. }
  525. else
  526. utils_errx(1, "malformed -o option field");
  527. if(olistcnt == olistalloc)
  528. {
  529. olistalloc += 50;
  530. if((olist = realloc(olist, olistalloc * sizeof(OLIST))) == NULL) utils_err(1, NULL);
  531. }
  532. olist[olistcnt].filenum = filenum;
  533. olist[olistcnt].fieldno = fieldno;
  534. ++olistcnt;
  535. }
  536. }
  537. static void
  538. usage(void)
  539. {
  540. (void)fprintf(stderr,
  541. "%s %s\n%s\n",
  542. "usage: join [-a fileno | -v fileno ] [-e string] [-1 field]",
  543. "[-2 field]",
  544. " [-o list] [-t char] file1 file2");
  545. exit(1);
  546. }