logo

utils-std

Collection of commonly available Unix tools git clone https://anongit.hacktivis.me/git/utils-std.git

join.c (15244B)


  1. /*-
  2. * SPDX-License-Identifier: BSD-3-Clause
  3. *
  4. * Copyright (c) 1991, 1993, 1994
  5. * The Regents of the University of California. All rights reserved.
  6. *
  7. * This code is derived from software contributed to Berkeley by
  8. * Steve Hayman of the Computer Science Department, Indiana University,
  9. * Michiro Hikida and David Goodenough.
  10. *
  11. * Redistribution and use in source and binary forms, with or without
  12. * modification, are permitted provided that the following conditions
  13. * are met:
  14. * 1. Redistributions of source code must retain the above copyright
  15. * notice, this list of conditions and the following disclaimer.
  16. * 2. Redistributions in binary form must reproduce the above copyright
  17. * notice, this list of conditions and the following disclaimer in the
  18. * documentation and/or other materials provided with the distribution.
  19. * 3. Neither the name of the University nor the names of its contributors
  20. * may be used to endorse or promote products derived from this software
  21. * without specific prior written permission.
  22. *
  23. * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  24. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  25. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  26. * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  27. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  28. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  29. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  30. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  32. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  33. * SUCH DAMAGE.
  34. */
  35. #define _BSD_SOURCE
  36. #include <err.h>
  37. #include <errno.h>
  38. #include <limits.h>
  39. #include <locale.h>
  40. #include <stdio.h>
  41. #include <stdlib.h>
  42. #include <string.h>
  43. #include <sys/param.h>
  44. #include <unistd.h>
  45. #include <wchar.h>
  46. /*
  47. * There's a structure per input file which encapsulates the state of the
  48. * file. We repeatedly read lines from each file until we've read in all
  49. * the consecutive lines from the file with a common join field. Then we
  50. * compare the set of lines with an equivalent set from the other file.
  51. */
  52. typedef struct
  53. {
  54. char *line; /* line */
  55. unsigned long linealloc; /* line allocated count */
  56. char **fields; /* line field(s) */
  57. unsigned long fieldcnt; /* line field(s) count */
  58. unsigned long fieldalloc; /* line field(s) allocated count */
  59. } LINE;
  60. typedef struct
  61. {
  62. FILE *fp; /* file descriptor */
  63. unsigned long joinf; /* join field (-1, -2, -j) */
  64. int unpair; /* output unpairable lines (-a) */
  65. unsigned long number; /* 1 for file 1, 2 for file 2 */
  66. LINE *set; /* set of lines with same field */
  67. int pushbool; /* if pushback is set */
  68. unsigned long pushback; /* line on the stack */
  69. unsigned long setcnt; /* set count */
  70. unsigned long setalloc; /* set allocated count */
  71. } INPUT;
  72. static INPUT input1 = {NULL, 0, 0, 1, NULL, 0, 0, 0, 0}, input2 = {NULL, 0, 0, 2, NULL, 0, 0, 0, 0};
  73. typedef struct
  74. {
  75. unsigned long filenum; /* file number */
  76. unsigned long fieldno; /* field number */
  77. } OLIST;
  78. static OLIST *olist; /* output field list */
  79. static unsigned long olistcnt; /* output field list count */
  80. static unsigned long olistalloc; /* output field allocated count */
  81. static int joinout = 1; /* show lines with matched join fields (-v) */
  82. static int needsep; /* need separator character */
  83. static int spans = 1; /* span multiple delimiters (-t) */
  84. static char *empty; /* empty field replacement string (-e) */
  85. static wchar_t default_tabchar[] = L" \t";
  86. static wchar_t *tabchar = default_tabchar; /* delimiter characters (-t) */
  87. static int cmp(LINE *, unsigned long, LINE *, unsigned long);
  88. static void fieldarg(char *);
  89. static void joinlines(INPUT *, INPUT *);
  90. static int mbscoll(const char *, const char *);
  91. static char *mbssep(char **, const wchar_t *);
  92. static void outfield(LINE *, unsigned long, int);
  93. static void outoneline(INPUT *, LINE *);
  94. static void outtwoline(INPUT *, LINE *, INPUT *, LINE *);
  95. static void slurp(INPUT *);
  96. static wchar_t *towcs(const char *);
  97. static void usage(void);
  98. const char *argv0 = "join";
  99. int
  100. main(int argc, char *argv[])
  101. {
  102. INPUT *F1, *F2;
  103. int aflag, ch, cval, vflag;
  104. char *end;
  105. setlocale(LC_ALL, "");
  106. F1 = &input1;
  107. F2 = &input2;
  108. aflag = vflag = 0;
  109. while((ch = getopt(argc, argv, ":a:e:j:1:2:o:t:v:")) != -1)
  110. {
  111. switch(ch)
  112. {
  113. case '1':
  114. if((F1->joinf = strtol(optarg, &end, 10)) < 1)
  115. errx(1, "error: -1 option field number less than 1");
  116. if(*end) errx(1, "error: illegal field number -- %s", optarg);
  117. --F1->joinf;
  118. break;
  119. case '2':
  120. if((F2->joinf = strtol(optarg, &end, 10)) < 1)
  121. errx(1, "error: -2 option field number less than 1");
  122. if(*end) errx(1, "error: illegal field number -- %s", optarg);
  123. --F2->joinf;
  124. break;
  125. case 'a':
  126. aflag = 1;
  127. switch(strtol(optarg, &end, 10))
  128. {
  129. case 1:
  130. F1->unpair = 1;
  131. break;
  132. case 2:
  133. F2->unpair = 1;
  134. break;
  135. default:
  136. errx(1, "error: -a option file number not 1 or 2");
  137. break;
  138. }
  139. if(*end) errx(1, "error: illegal file number -- %s", optarg);
  140. break;
  141. case 'e':
  142. empty = optarg;
  143. break;
  144. case 'j':
  145. if((F1->joinf = F2->joinf = strtol(optarg, &end, 10)) < 1)
  146. errx(1, "error: -j option field number less than 1");
  147. if(*end) errx(1, "error: illegal field number -- %s", optarg);
  148. --F1->joinf;
  149. --F2->joinf;
  150. break;
  151. case 'o':
  152. fieldarg(optarg);
  153. break;
  154. case 't':
  155. spans = 0;
  156. if(mbrtowc(&tabchar[0], optarg, MB_LEN_MAX, NULL) != strlen(optarg))
  157. errx(1, "error: illegal tab character specification");
  158. tabchar[1] = L'\0';
  159. break;
  160. case 'v':
  161. vflag = 1;
  162. joinout = 0;
  163. switch(strtol(optarg, &end, 10))
  164. {
  165. case 1:
  166. F1->unpair = 1;
  167. break;
  168. case 2:
  169. F2->unpair = 1;
  170. break;
  171. default:
  172. errx(1, "error: -v option file number not 1 or 2");
  173. break;
  174. }
  175. if(*end) errx(1, "error: illegal file number -- %s", optarg);
  176. break;
  177. case ':':
  178. fprintf(stderr, "%s: error: Missing operand for option: '-%c'\n", argv0, optopt);
  179. usage();
  180. return 1;
  181. case '?':
  182. fprintf(stderr, "%s: error: Unrecognised option: '-%c'\n", argv0, optopt);
  183. usage();
  184. return 1;
  185. default:
  186. abort();
  187. }
  188. }
  189. argc -= optind;
  190. argv += optind;
  191. if(aflag && vflag) errx(1, "error: the -a and -v options are mutually exclusive");
  192. if(argc != 2)
  193. {
  194. fprintf(stderr, "%s: error: Need exactly 2 file operands, got %d instead\n", argv0, argc);
  195. usage();
  196. }
  197. /* Open the files; "-" means stdin. */
  198. if(!strcmp(*argv, "-"))
  199. F1->fp = stdin;
  200. else if((F1->fp = fopen(*argv, "r")) == NULL)
  201. err(1, "error: %s", *argv);
  202. ++argv;
  203. if(!strcmp(*argv, "-"))
  204. F2->fp = stdin;
  205. else if((F2->fp = fopen(*argv, "r")) == NULL)
  206. err(1, "error: %s", *argv);
  207. if(F1->fp == stdin && F2->fp == stdin) errx(1, "error: only one input file may be stdin");
  208. slurp(F1);
  209. slurp(F2);
  210. while(F1->setcnt && F2->setcnt)
  211. {
  212. cval = cmp(F1->set, F1->joinf, F2->set, F2->joinf);
  213. if(cval == 0)
  214. {
  215. /* Oh joy, oh rapture, oh beauty divine! */
  216. if(joinout) joinlines(F1, F2);
  217. slurp(F1);
  218. slurp(F2);
  219. }
  220. else if(cval < 0)
  221. {
  222. /* File 1 takes the lead... */
  223. if(F1->unpair) joinlines(F1, NULL);
  224. slurp(F1);
  225. }
  226. else
  227. {
  228. /* File 2 takes the lead... */
  229. if(F2->unpair) joinlines(F2, NULL);
  230. slurp(F2);
  231. }
  232. }
  233. /*
  234. * Now that one of the files is used up, optionally output any
  235. * remaining lines from the other file.
  236. */
  237. if(F1->unpair)
  238. while(F1->setcnt)
  239. {
  240. joinlines(F1, NULL);
  241. slurp(F1);
  242. }
  243. if(F2->unpair)
  244. while(F2->setcnt)
  245. {
  246. joinlines(F2, NULL);
  247. slurp(F2);
  248. }
  249. exit(0);
  250. }
  251. static void
  252. slurp(INPUT *F)
  253. {
  254. LINE *lp, *lastlp, tmp;
  255. size_t len;
  256. int cnt;
  257. char *bp, *fieldp;
  258. /*
  259. * Read all of the lines from an input file that have the same
  260. * join field.
  261. */
  262. F->setcnt = 0;
  263. for(lastlp = NULL;; ++F->setcnt)
  264. {
  265. /*
  266. * If we're out of space to hold line structures, allocate
  267. * more. Initialize the structure so that we know that this
  268. * is new space.
  269. */
  270. if(F->setcnt == F->setalloc)
  271. {
  272. cnt = F->setalloc;
  273. F->setalloc += 50;
  274. if((F->set = realloc(F->set, F->setalloc * sizeof(LINE))) == NULL) err(1, NULL);
  275. memset(F->set + cnt, 0, 50 * sizeof(LINE));
  276. /* re-set lastlp in case it moved */
  277. if(lastlp != NULL) lastlp = &F->set[F->setcnt - 1];
  278. }
  279. /*
  280. * Get any pushed back line, else get the next line. Allocate
  281. * space as necessary. If taking the line from the stack swap
  282. * the two structures so that we don't lose space allocated to
  283. * either structure. This could be avoided by doing another
  284. * level of indirection, but it's probably okay as is.
  285. */
  286. lp = &F->set[F->setcnt];
  287. if(F->setcnt) lastlp = &F->set[F->setcnt - 1];
  288. if(F->pushbool)
  289. {
  290. tmp = F->set[F->setcnt];
  291. F->set[F->setcnt] = F->set[F->pushback];
  292. F->set[F->pushback] = tmp;
  293. F->pushbool = 0;
  294. continue;
  295. }
  296. if((bp = fgetln(F->fp, &len)) == NULL) return;
  297. if(lp->linealloc <= len + 1)
  298. {
  299. lp->linealloc += MAX(100, len + 1 - lp->linealloc);
  300. if((lp->line = realloc(lp->line, lp->linealloc)) == NULL) err(1, NULL);
  301. }
  302. memmove(lp->line, bp, len);
  303. /* Replace trailing newline, if it exists. */
  304. if(bp[len - 1] == '\n')
  305. lp->line[len - 1] = '\0';
  306. else
  307. lp->line[len] = '\0';
  308. bp = lp->line;
  309. /* Split the line into fields, allocate space as necessary. */
  310. lp->fieldcnt = 0;
  311. while((fieldp = mbssep(&bp, tabchar)) != NULL)
  312. {
  313. if(spans && *fieldp == '\0') continue;
  314. if(lp->fieldcnt == lp->fieldalloc)
  315. {
  316. lp->fieldalloc += 50;
  317. if((lp->fields = realloc(lp->fields, lp->fieldalloc * sizeof(char *))) == NULL)
  318. err(1, NULL);
  319. }
  320. lp->fields[lp->fieldcnt++] = fieldp;
  321. }
  322. /* See if the join field value has changed. */
  323. if(lastlp != NULL && cmp(lp, F->joinf, lastlp, F->joinf))
  324. {
  325. F->pushbool = 1;
  326. F->pushback = F->setcnt;
  327. break;
  328. }
  329. }
  330. }
  331. static char *
  332. mbssep(char **stringp, const wchar_t *delim)
  333. {
  334. char *s, *tok;
  335. const wchar_t *spanp;
  336. wchar_t c, sc;
  337. size_t n;
  338. if((s = *stringp) == NULL) return (NULL);
  339. for(tok = s;;)
  340. {
  341. n = mbrtowc(&c, s, MB_LEN_MAX, NULL);
  342. if(n == (size_t)-1 || n == (size_t)-2)
  343. {
  344. fprintf(stderr, "join: error: mbrtowc: Illegal byte sequence\n");
  345. exit(1);
  346. }
  347. s += n;
  348. spanp = delim;
  349. do
  350. {
  351. if((sc = *spanp++) == c)
  352. {
  353. if(c == 0)
  354. s = NULL;
  355. else
  356. s[-n] = '\0';
  357. *stringp = s;
  358. return (tok);
  359. }
  360. } while(sc != 0);
  361. }
  362. }
  363. static int
  364. cmp(LINE *lp1, unsigned long fieldno1, LINE *lp2, unsigned long fieldno2)
  365. {
  366. if(lp1->fieldcnt <= fieldno1) return (lp2->fieldcnt <= fieldno2 ? 0 : 1);
  367. if(lp2->fieldcnt <= fieldno2) return (-1);
  368. return (mbscoll(lp1->fields[fieldno1], lp2->fields[fieldno2]));
  369. }
  370. static int
  371. mbscoll(const char *s1, const char *s2)
  372. {
  373. wchar_t *w1, *w2;
  374. int ret;
  375. if(MB_CUR_MAX == 1) return (strcoll(s1, s2));
  376. if((w1 = towcs(s1)) == NULL || (w2 = towcs(s2)) == NULL) err(1, NULL); /* XXX */
  377. ret = wcscoll(w1, w2);
  378. free(w1);
  379. free(w2);
  380. return (ret);
  381. }
  382. static wchar_t *
  383. towcs(const char *s)
  384. {
  385. wchar_t *wcs;
  386. size_t n;
  387. if((n = mbsrtowcs(NULL, &s, 0, NULL)) == (size_t)-1) return (NULL);
  388. if((wcs = calloc((n + 1), sizeof(*wcs))) == NULL) return (NULL);
  389. mbsrtowcs(wcs, &s, n + 1, NULL);
  390. return (wcs);
  391. }
  392. static void
  393. joinlines(INPUT *F1, INPUT *F2)
  394. {
  395. unsigned long cnt1, cnt2;
  396. /*
  397. * Output the results of a join comparison. The output may be from
  398. * either file 1 or file 2 (in which case the first argument is the
  399. * file from which to output) or from both.
  400. */
  401. if(F2 == NULL)
  402. {
  403. for(cnt1 = 0; cnt1 < F1->setcnt; ++cnt1)
  404. outoneline(F1, &F1->set[cnt1]);
  405. return;
  406. }
  407. for(cnt1 = 0; cnt1 < F1->setcnt; ++cnt1)
  408. for(cnt2 = 0; cnt2 < F2->setcnt; ++cnt2)
  409. outtwoline(F1, &F1->set[cnt1], F2, &F2->set[cnt2]);
  410. }
  411. static void
  412. outoneline(INPUT *F, LINE *lp)
  413. {
  414. unsigned long cnt;
  415. /*
  416. * Output a single line from one of the files, according to the
  417. * join rules. This happens when we are writing unmatched single
  418. * lines. Output empty fields in the right places.
  419. */
  420. if(olist)
  421. for(cnt = 0; cnt < olistcnt; ++cnt)
  422. {
  423. if(olist[cnt].filenum == (unsigned)F->number)
  424. outfield(lp, olist[cnt].fieldno, 0);
  425. else if(olist[cnt].filenum == 0)
  426. outfield(lp, F->joinf, 0);
  427. else
  428. outfield(lp, 0, 1);
  429. }
  430. else
  431. {
  432. /*
  433. * Output the join field, then the remaining fields.
  434. */
  435. outfield(lp, F->joinf, 0);
  436. for(cnt = 0; cnt < lp->fieldcnt; ++cnt)
  437. if(F->joinf != cnt) outfield(lp, cnt, 0);
  438. }
  439. (void)printf("\n");
  440. if(ferror(stdout)) err(1, "error: stdout");
  441. needsep = 0;
  442. }
  443. static void
  444. outtwoline(INPUT *F1, LINE *lp1, INPUT *F2, LINE *lp2)
  445. {
  446. unsigned long cnt;
  447. /* Output a pair of lines according to the join list (if any). */
  448. if(olist)
  449. for(cnt = 0; cnt < olistcnt; ++cnt)
  450. if(olist[cnt].filenum == 0)
  451. {
  452. if(lp1->fieldcnt >= F1->joinf)
  453. outfield(lp1, F1->joinf, 0);
  454. else
  455. outfield(lp2, F2->joinf, 0);
  456. }
  457. else if(olist[cnt].filenum == 1)
  458. outfield(lp1, olist[cnt].fieldno, 0);
  459. else /* if (olist[cnt].filenum == 2) */
  460. outfield(lp2, olist[cnt].fieldno, 0);
  461. else
  462. {
  463. /*
  464. * Output the join field, then the remaining fields from F1
  465. * and F2.
  466. */
  467. outfield(lp1, F1->joinf, 0);
  468. for(cnt = 0; cnt < lp1->fieldcnt; ++cnt)
  469. if(F1->joinf != cnt) outfield(lp1, cnt, 0);
  470. for(cnt = 0; cnt < lp2->fieldcnt; ++cnt)
  471. if(F2->joinf != cnt) outfield(lp2, cnt, 0);
  472. }
  473. (void)printf("\n");
  474. if(ferror(stdout)) err(1, "error: stdout");
  475. needsep = 0;
  476. }
  477. static void
  478. outfield(LINE *lp, unsigned long fieldno, int out_empty)
  479. {
  480. if(needsep++) (void)printf("%lc", (wint_t)*tabchar);
  481. if(!ferror(stdout))
  482. {
  483. if(lp->fieldcnt <= fieldno || out_empty)
  484. {
  485. if(empty != NULL) (void)printf("%s", empty);
  486. }
  487. else
  488. {
  489. if(*lp->fields[fieldno] == '\0') return;
  490. (void)printf("%s", lp->fields[fieldno]);
  491. }
  492. }
  493. if(ferror(stdout)) err(1, "error: stdout");
  494. }
  495. /*
  496. * Convert an output list argument "2.1, 1.3, 2.4" into an array of output
  497. * fields.
  498. */
  499. static void
  500. fieldarg(char *option)
  501. {
  502. unsigned long fieldno, filenum;
  503. char *end, *token;
  504. while((token = strsep(&option, ", \t")) != NULL)
  505. {
  506. if(*token == '\0') continue;
  507. if(token[0] == '0')
  508. filenum = fieldno = 0;
  509. else if((token[0] == '1' || token[0] == '2') && token[1] == '.')
  510. {
  511. filenum = token[0] - '0';
  512. fieldno = strtol(token + 2, &end, 10);
  513. if(*end) errx(1, "error: malformed -o option field");
  514. if(fieldno == 0) errx(1, "error: field numbers are 1 based");
  515. --fieldno;
  516. }
  517. else
  518. errx(1, "error: malformed -o option field");
  519. if(olistcnt == olistalloc)
  520. {
  521. olistalloc += 50;
  522. if((olist = realloc(olist, olistalloc * sizeof(OLIST))) == NULL) err(1, NULL);
  523. }
  524. olist[olistcnt].filenum = filenum;
  525. olist[olistcnt].fieldno = fieldno;
  526. ++olistcnt;
  527. }
  528. }
  529. static void
  530. usage(void)
  531. {
  532. (void)fprintf(stderr,
  533. "%s %s\n%s\n",
  534. "usage: join [-a fileno | -v fileno ] [-e string] [-1 field]",
  535. "[-2 field]",
  536. " [-o list] [-t char] file1 file2");
  537. exit(1);
  538. }