logo

utils-std

Collection of commonly available Unix tools git clone https://anongit.hacktivis.me/git/utils-std.git/

join.c (15280B)


  1. /*-
  2. * SPDX-License-Identifier: BSD-3-Clause
  3. *
  4. * Copyright (c) 1991, 1993, 1994
  5. * The Regents of the University of California. All rights reserved.
  6. *
  7. * This code is derived from software contributed to Berkeley by
  8. * Steve Hayman of the Computer Science Department, Indiana University,
  9. * Michiro Hikida and David Goodenough.
  10. *
  11. * Redistribution and use in source and binary forms, with or without
  12. * modification, are permitted provided that the following conditions
  13. * are met:
  14. * 1. Redistributions of source code must retain the above copyright
  15. * notice, this list of conditions and the following disclaimer.
  16. * 2. Redistributions in binary form must reproduce the above copyright
  17. * notice, this list of conditions and the following disclaimer in the
  18. * documentation and/or other materials provided with the distribution.
  19. * 3. Neither the name of the University nor the names of its contributors
  20. * may be used to endorse or promote products derived from this software
  21. * without specific prior written permission.
  22. *
  23. * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  24. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  25. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  26. * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  27. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  28. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  29. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  30. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  32. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  33. * SUCH DAMAGE.
  34. */
  35. #define _BSD_SOURCE // fgetln, strsep
  36. #include "../lib/err.h"
  37. #include <limits.h>
  38. #include <locale.h>
  39. #include <stdio.h> // fgetln
  40. #include <stdlib.h>
  41. #include <string.h> // strsep
  42. #include <sys/param.h>
  43. #include <unistd.h>
  44. #include <wchar.h>
  45. /*
  46. * There's a structure per input file which encapsulates the state of the
  47. * file. We repeatedly read lines from each file until we've read in all
  48. * the consecutive lines from the file with a common join field. Then we
  49. * compare the set of lines with an equivalent set from the other file.
  50. */
  51. typedef struct
  52. {
  53. char *line; /* line */
  54. unsigned long linealloc; /* line allocated count */
  55. char **fields; /* line field(s) */
  56. unsigned long fieldcnt; /* line field(s) count */
  57. unsigned long fieldalloc; /* line field(s) allocated count */
  58. } LINE;
  59. typedef struct
  60. {
  61. FILE *fp; /* file descriptor */
  62. unsigned long joinf; /* join field (-1, -2, -j) */
  63. int unpair; /* output unpairable lines (-a) */
  64. unsigned long number; /* 1 for file 1, 2 for file 2 */
  65. LINE *set; /* set of lines with same field */
  66. int pushbool; /* if pushback is set */
  67. unsigned long pushback; /* line on the stack */
  68. unsigned long setcnt; /* set count */
  69. unsigned long setalloc; /* set allocated count */
  70. } INPUT;
  71. static INPUT input1 = {NULL, 0, 0, 1, NULL, 0, 0, 0, 0}, input2 = {NULL, 0, 0, 2, NULL, 0, 0, 0, 0};
  72. typedef struct
  73. {
  74. unsigned long filenum; /* file number */
  75. unsigned long fieldno; /* field number */
  76. } OLIST;
  77. static OLIST *olist; /* output field list */
  78. static unsigned long olistcnt; /* output field list count */
  79. static unsigned long olistalloc; /* output field allocated count */
  80. static int joinout = 1; /* show lines with matched join fields (-v) */
  81. static int needsep; /* need separator character */
  82. static int spans = 1; /* span multiple delimiters (-t) */
  83. static char *empty; /* empty field replacement string (-e) */
  84. static wchar_t default_tabchar[] = L" \t";
  85. static wchar_t *tabchar = default_tabchar; /* delimiter characters (-t) */
  86. static int cmp(LINE *, unsigned long, LINE *, unsigned long);
  87. static void fieldarg(char *);
  88. static void joinlines(INPUT *, INPUT *);
  89. static int mbscoll(const char *, const char *);
  90. static char *mbssep(char **, const wchar_t *);
  91. static void outfield(LINE *, unsigned long, int);
  92. static void outoneline(INPUT *, LINE *);
  93. static void outtwoline(INPUT *, LINE *, INPUT *, LINE *);
  94. static void slurp(INPUT *);
  95. static wchar_t *towcs(const char *);
  96. static void usage(void);
  97. const char *argv0 = "join";
  98. int
  99. main(int argc, char *argv[])
  100. {
  101. INPUT *F1, *F2;
  102. int aflag, ch, cval, vflag;
  103. char *end;
  104. setlocale(LC_ALL, "");
  105. F1 = &input1;
  106. F2 = &input2;
  107. aflag = vflag = 0;
  108. while((ch = getopt(argc, argv, ":a:e:j:1:2:o:t:v:")) != -1)
  109. {
  110. switch(ch)
  111. {
  112. case '1':
  113. if((F1->joinf = strtol(optarg, &end, 10)) < 1)
  114. utils_errx(1, "-1 option field number less than 1");
  115. if(*end) utils_errx(1, "illegal field number -- %s", optarg);
  116. --F1->joinf;
  117. break;
  118. case '2':
  119. if((F2->joinf = strtol(optarg, &end, 10)) < 1)
  120. utils_errx(1, "-2 option field number less than 1");
  121. if(*end) utils_errx(1, "illegal field number -- %s", optarg);
  122. --F2->joinf;
  123. break;
  124. case 'a':
  125. aflag = 1;
  126. switch(strtol(optarg, &end, 10))
  127. {
  128. case 1:
  129. F1->unpair = 1;
  130. break;
  131. case 2:
  132. F2->unpair = 1;
  133. break;
  134. default:
  135. utils_errx(1, "-a option file number not 1 or 2");
  136. break;
  137. }
  138. if(*end) utils_errx(1, "illegal file number -- %s", optarg);
  139. break;
  140. case 'e':
  141. empty = optarg;
  142. break;
  143. case 'j':
  144. if((F1->joinf = F2->joinf = strtol(optarg, &end, 10)) < 1)
  145. utils_errx(1, "-j option field number less than 1");
  146. if(*end) utils_errx(1, "illegal field number -- %s", optarg);
  147. --F1->joinf;
  148. --F2->joinf;
  149. break;
  150. case 'o':
  151. fieldarg(optarg);
  152. break;
  153. case 't':
  154. spans = 0;
  155. if(mbrtowc(&tabchar[0], optarg, MB_LEN_MAX, NULL) != strlen(optarg))
  156. utils_errx(1, "illegal tab character specification");
  157. tabchar[1] = L'\0';
  158. break;
  159. case 'v':
  160. vflag = 1;
  161. joinout = 0;
  162. switch(strtol(optarg, &end, 10))
  163. {
  164. case 1:
  165. F1->unpair = 1;
  166. break;
  167. case 2:
  168. F2->unpair = 1;
  169. break;
  170. default:
  171. utils_errx(1, "-v option file number not 1 or 2");
  172. break;
  173. }
  174. if(*end) utils_errx(1, "illegal file number -- %s", optarg);
  175. break;
  176. case ':':
  177. fprintf(stderr, "%s: error: Missing operand for option: '-%c'\n", argv0, optopt);
  178. usage();
  179. return 1;
  180. case '?':
  181. fprintf(stderr, "%s: error: Unrecognised option: '-%c'\n", argv0, optopt);
  182. usage();
  183. return 1;
  184. default:
  185. abort();
  186. }
  187. }
  188. argc -= optind;
  189. argv += optind;
  190. if(aflag && vflag) utils_errx(1, "the -a and -v options are mutually exclusive");
  191. if(argc != 2)
  192. {
  193. fprintf(stderr, "%s: error: Need exactly 2 file operands, got %d instead\n", argv0, argc);
  194. usage();
  195. }
  196. /* Open the files; "-" means stdin. */
  197. if(!strcmp(*argv, "-"))
  198. F1->fp = stdin;
  199. else if((F1->fp = fopen(*argv, "r")) == NULL)
  200. utils_err(1, "%s", *argv);
  201. ++argv;
  202. if(!strcmp(*argv, "-"))
  203. F2->fp = stdin;
  204. else if((F2->fp = fopen(*argv, "r")) == NULL)
  205. utils_err(1, "%s", *argv);
  206. if(F1->fp == stdin && F2->fp == stdin) utils_errx(1, "only one input file may be stdin");
  207. slurp(F1);
  208. slurp(F2);
  209. while(F1->setcnt && F2->setcnt)
  210. {
  211. cval = cmp(F1->set, F1->joinf, F2->set, F2->joinf);
  212. if(cval == 0)
  213. {
  214. /* Oh joy, oh rapture, oh beauty divine! */
  215. if(joinout) joinlines(F1, F2);
  216. slurp(F1);
  217. slurp(F2);
  218. }
  219. else if(cval < 0)
  220. {
  221. /* File 1 takes the lead... */
  222. if(F1->unpair) joinlines(F1, NULL);
  223. slurp(F1);
  224. }
  225. else
  226. {
  227. /* File 2 takes the lead... */
  228. if(F2->unpair) joinlines(F2, NULL);
  229. slurp(F2);
  230. }
  231. }
  232. /*
  233. * Now that one of the files is used up, optionally output any
  234. * remaining lines from the other file.
  235. */
  236. if(F1->unpair)
  237. while(F1->setcnt)
  238. {
  239. joinlines(F1, NULL);
  240. slurp(F1);
  241. }
  242. if(F2->unpair)
  243. while(F2->setcnt)
  244. {
  245. joinlines(F2, NULL);
  246. slurp(F2);
  247. }
  248. exit(0);
  249. }
  250. static void
  251. slurp(INPUT *F)
  252. {
  253. LINE *lp, *lastlp, tmp;
  254. size_t len;
  255. int cnt;
  256. char *bp, *fieldp;
  257. /*
  258. * Read all of the lines from an input file that have the same
  259. * join field.
  260. */
  261. F->setcnt = 0;
  262. for(lastlp = NULL;; ++F->setcnt)
  263. {
  264. /*
  265. * If we're out of space to hold line structures, allocate
  266. * more. Initialize the structure so that we know that this
  267. * is new space.
  268. */
  269. if(F->setcnt == F->setalloc)
  270. {
  271. cnt = F->setalloc;
  272. F->setalloc += 50;
  273. if((F->set = realloc(F->set, F->setalloc * sizeof(LINE))) == NULL) utils_err(1, NULL);
  274. memset(F->set + cnt, 0, 50 * sizeof(LINE));
  275. /* re-set lastlp in case it moved */
  276. if(lastlp != NULL) lastlp = &F->set[F->setcnt - 1];
  277. }
  278. /*
  279. * Get any pushed back line, else get the next line. Allocate
  280. * space as necessary. If taking the line from the stack swap
  281. * the two structures so that we don't lose space allocated to
  282. * either structure. This could be avoided by doing another
  283. * level of indirection, but it's probably okay as is.
  284. */
  285. lp = &F->set[F->setcnt];
  286. if(F->setcnt) lastlp = &F->set[F->setcnt - 1];
  287. if(F->pushbool)
  288. {
  289. tmp = F->set[F->setcnt];
  290. F->set[F->setcnt] = F->set[F->pushback];
  291. F->set[F->pushback] = tmp;
  292. F->pushbool = 0;
  293. continue;
  294. }
  295. if((bp = fgetln(F->fp, &len)) == NULL) return;
  296. if(lp->linealloc <= len + 1)
  297. {
  298. lp->linealloc += MAX(100, len + 1 - lp->linealloc);
  299. if((lp->line = realloc(lp->line, lp->linealloc)) == NULL) utils_err(1, NULL);
  300. }
  301. memmove(lp->line, bp, len);
  302. /* Replace trailing newline, if it exists. */
  303. if(bp[len - 1] == '\n')
  304. lp->line[len - 1] = '\0';
  305. else
  306. lp->line[len] = '\0';
  307. bp = lp->line;
  308. /* Split the line into fields, allocate space as necessary. */
  309. lp->fieldcnt = 0;
  310. while((fieldp = mbssep(&bp, tabchar)) != NULL)
  311. {
  312. if(spans && *fieldp == '\0') continue;
  313. if(lp->fieldcnt == lp->fieldalloc)
  314. {
  315. lp->fieldalloc += 50;
  316. if((lp->fields = realloc(lp->fields, lp->fieldalloc * sizeof(char *))) == NULL)
  317. utils_err(1, NULL);
  318. }
  319. lp->fields[lp->fieldcnt++] = fieldp;
  320. }
  321. /* See if the join field value has changed. */
  322. if(lastlp != NULL && cmp(lp, F->joinf, lastlp, F->joinf))
  323. {
  324. F->pushbool = 1;
  325. F->pushback = F->setcnt;
  326. break;
  327. }
  328. }
  329. }
  330. static char *
  331. mbssep(char **stringp, const wchar_t *delim)
  332. {
  333. char *s, *tok;
  334. const wchar_t *spanp;
  335. wchar_t c, sc;
  336. size_t n;
  337. if((s = *stringp) == NULL) return (NULL);
  338. for(tok = s;;)
  339. {
  340. n = mbrtowc(&c, s, MB_LEN_MAX, NULL);
  341. if(n == (size_t)-1 || n == (size_t)-2)
  342. {
  343. fprintf(stderr, "join: error: mbrtowc: Illegal byte sequence\n");
  344. exit(1);
  345. }
  346. s += n;
  347. spanp = delim;
  348. do
  349. {
  350. if((sc = *spanp++) == c)
  351. {
  352. if(c == 0)
  353. s = NULL;
  354. else
  355. s[-n] = '\0';
  356. *stringp = s;
  357. return (tok);
  358. }
  359. } while(sc != 0);
  360. }
  361. }
  362. static int
  363. cmp(LINE *lp1, unsigned long fieldno1, LINE *lp2, unsigned long fieldno2)
  364. {
  365. if(lp1->fieldcnt <= fieldno1) return (lp2->fieldcnt <= fieldno2 ? 0 : 1);
  366. if(lp2->fieldcnt <= fieldno2) return (-1);
  367. return (mbscoll(lp1->fields[fieldno1], lp2->fields[fieldno2]));
  368. }
  369. static int
  370. mbscoll(const char *s1, const char *s2)
  371. {
  372. wchar_t *w1, *w2;
  373. int ret;
  374. if(MB_CUR_MAX == 1) return (strcoll(s1, s2));
  375. if((w1 = towcs(s1)) == NULL || (w2 = towcs(s2)) == NULL) utils_err(1, NULL); /* XXX */
  376. ret = wcscoll(w1, w2);
  377. free(w1);
  378. free(w2);
  379. return (ret);
  380. }
  381. static wchar_t *
  382. towcs(const char *s)
  383. {
  384. wchar_t *wcs;
  385. size_t n;
  386. if((n = mbsrtowcs(NULL, &s, 0, NULL)) == (size_t)-1) return (NULL);
  387. if((wcs = calloc((n + 1), sizeof(*wcs))) == NULL) return (NULL);
  388. mbsrtowcs(wcs, &s, n + 1, NULL);
  389. return (wcs);
  390. }
  391. static void
  392. joinlines(INPUT *F1, INPUT *F2)
  393. {
  394. unsigned long cnt1, cnt2;
  395. /*
  396. * Output the results of a join comparison. The output may be from
  397. * either file 1 or file 2 (in which case the first argument is the
  398. * file from which to output) or from both.
  399. */
  400. if(F2 == NULL)
  401. {
  402. for(cnt1 = 0; cnt1 < F1->setcnt; ++cnt1)
  403. outoneline(F1, &F1->set[cnt1]);
  404. return;
  405. }
  406. for(cnt1 = 0; cnt1 < F1->setcnt; ++cnt1)
  407. for(cnt2 = 0; cnt2 < F2->setcnt; ++cnt2)
  408. outtwoline(F1, &F1->set[cnt1], F2, &F2->set[cnt2]);
  409. }
  410. static void
  411. outoneline(INPUT *F, LINE *lp)
  412. {
  413. unsigned long cnt;
  414. /*
  415. * Output a single line from one of the files, according to the
  416. * join rules. This happens when we are writing unmatched single
  417. * lines. Output empty fields in the right places.
  418. */
  419. if(olist)
  420. for(cnt = 0; cnt < olistcnt; ++cnt)
  421. {
  422. if(olist[cnt].filenum == (unsigned)F->number)
  423. outfield(lp, olist[cnt].fieldno, 0);
  424. else if(olist[cnt].filenum == 0)
  425. outfield(lp, F->joinf, 0);
  426. else
  427. outfield(lp, 0, 1);
  428. }
  429. else
  430. {
  431. /*
  432. * Output the join field, then the remaining fields.
  433. */
  434. outfield(lp, F->joinf, 0);
  435. for(cnt = 0; cnt < lp->fieldcnt; ++cnt)
  436. if(F->joinf != cnt) outfield(lp, cnt, 0);
  437. }
  438. (void)printf("\n");
  439. if(ferror(stdout)) utils_err(1, "stdout");
  440. needsep = 0;
  441. }
  442. static void
  443. outtwoline(INPUT *F1, LINE *lp1, INPUT *F2, LINE *lp2)
  444. {
  445. unsigned long cnt;
  446. /* Output a pair of lines according to the join list (if any). */
  447. if(olist)
  448. for(cnt = 0; cnt < olistcnt; ++cnt)
  449. if(olist[cnt].filenum == 0)
  450. {
  451. if(lp1->fieldcnt >= F1->joinf)
  452. outfield(lp1, F1->joinf, 0);
  453. else
  454. outfield(lp2, F2->joinf, 0);
  455. }
  456. else if(olist[cnt].filenum == 1)
  457. outfield(lp1, olist[cnt].fieldno, 0);
  458. else /* if (olist[cnt].filenum == 2) */
  459. outfield(lp2, olist[cnt].fieldno, 0);
  460. else
  461. {
  462. /*
  463. * Output the join field, then the remaining fields from F1
  464. * and F2.
  465. */
  466. outfield(lp1, F1->joinf, 0);
  467. for(cnt = 0; cnt < lp1->fieldcnt; ++cnt)
  468. if(F1->joinf != cnt) outfield(lp1, cnt, 0);
  469. for(cnt = 0; cnt < lp2->fieldcnt; ++cnt)
  470. if(F2->joinf != cnt) outfield(lp2, cnt, 0);
  471. }
  472. (void)printf("\n");
  473. if(ferror(stdout)) utils_err(1, "stdout");
  474. needsep = 0;
  475. }
  476. static void
  477. outfield(LINE *lp, unsigned long fieldno, int out_empty)
  478. {
  479. if(needsep++) (void)printf("%lc", (wint_t)*tabchar);
  480. if(!ferror(stdout))
  481. {
  482. if(lp->fieldcnt <= fieldno || out_empty)
  483. {
  484. if(empty != NULL) (void)printf("%s", empty);
  485. }
  486. else
  487. {
  488. if(*lp->fields[fieldno] == '\0') return;
  489. (void)printf("%s", lp->fields[fieldno]);
  490. }
  491. }
  492. if(ferror(stdout)) utils_err(1, "stdout");
  493. }
  494. /*
  495. * Convert an output list argument "2.1, 1.3, 2.4" into an array of output
  496. * fields.
  497. */
  498. static void
  499. fieldarg(char *option)
  500. {
  501. unsigned long fieldno, filenum;
  502. char *end, *token;
  503. while((token = strsep(&option, ", \t")) != NULL)
  504. {
  505. if(*token == '\0') continue;
  506. if(token[0] == '0')
  507. filenum = fieldno = 0;
  508. else if((token[0] == '1' || token[0] == '2') && token[1] == '.')
  509. {
  510. filenum = token[0] - '0';
  511. fieldno = strtol(token + 2, &end, 10);
  512. if(*end) utils_errx(1, "malformed -o option field");
  513. if(fieldno == 0) utils_errx(1, "field numbers are 1 based");
  514. --fieldno;
  515. }
  516. else
  517. utils_errx(1, "malformed -o option field");
  518. if(olistcnt == olistalloc)
  519. {
  520. olistalloc += 50;
  521. if((olist = realloc(olist, olistalloc * sizeof(OLIST))) == NULL) utils_err(1, NULL);
  522. }
  523. olist[olistcnt].filenum = filenum;
  524. olist[olistcnt].fieldno = fieldno;
  525. ++olistcnt;
  526. }
  527. }
  528. static void
  529. usage(void)
  530. {
  531. (void)fprintf(stderr,
  532. "%s %s\n%s\n",
  533. "usage: join [-a fileno | -v fileno ] [-e string] [-1 field]",
  534. "[-2 field]",
  535. " [-o list] [-t char] file1 file2");
  536. exit(1);
  537. }