logo

utils-std

Collection of commonly available Unix tools git clone https://anongit.hacktivis.me/git/utils-std.git

join.c (14726B)


  1. /*-
  2. * SPDX-License-Identifier: BSD-3-Clause
  3. *
  4. * Copyright (c) 1991, 1993, 1994
  5. * The Regents of the University of California. All rights reserved.
  6. *
  7. * This code is derived from software contributed to Berkeley by
  8. * Steve Hayman of the Computer Science Department, Indiana University,
  9. * Michiro Hikida and David Goodenough.
  10. *
  11. * Redistribution and use in source and binary forms, with or without
  12. * modification, are permitted provided that the following conditions
  13. * are met:
  14. * 1. Redistributions of source code must retain the above copyright
  15. * notice, this list of conditions and the following disclaimer.
  16. * 2. Redistributions in binary form must reproduce the above copyright
  17. * notice, this list of conditions and the following disclaimer in the
  18. * documentation and/or other materials provided with the distribution.
  19. * 3. Neither the name of the University nor the names of its contributors
  20. * may be used to endorse or promote products derived from this software
  21. * without specific prior written permission.
  22. *
  23. * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  24. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  25. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  26. * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  27. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  28. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  29. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  30. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  32. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  33. * SUCH DAMAGE.
  34. */
  35. #define _BSD_SOURCE
  36. #include <err.h>
  37. #include <errno.h>
  38. #include <limits.h>
  39. #include <locale.h>
  40. #include <stdio.h>
  41. #include <stdlib.h>
  42. #include <string.h>
  43. #include <sys/param.h>
  44. #include <unistd.h>
  45. #include <wchar.h>
  46. /*
  47. * There's a structure per input file which encapsulates the state of the
  48. * file. We repeatedly read lines from each file until we've read in all
  49. * the consecutive lines from the file with a common join field. Then we
  50. * compare the set of lines with an equivalent set from the other file.
  51. */
  52. typedef struct
  53. {
  54. char *line; /* line */
  55. unsigned long linealloc; /* line allocated count */
  56. char **fields; /* line field(s) */
  57. unsigned long fieldcnt; /* line field(s) count */
  58. unsigned long fieldalloc; /* line field(s) allocated count */
  59. } LINE;
  60. typedef struct
  61. {
  62. FILE *fp; /* file descriptor */
  63. unsigned long joinf; /* join field (-1, -2, -j) */
  64. int unpair; /* output unpairable lines (-a) */
  65. unsigned long number; /* 1 for file 1, 2 for file 2 */
  66. LINE *set; /* set of lines with same field */
  67. int pushbool; /* if pushback is set */
  68. unsigned long pushback; /* line on the stack */
  69. unsigned long setcnt; /* set count */
  70. unsigned long setalloc; /* set allocated count */
  71. } INPUT;
  72. static INPUT input1 = {NULL, 0, 0, 1, NULL, 0, 0, 0, 0}, input2 = {NULL, 0, 0, 2, NULL, 0, 0, 0, 0};
  73. typedef struct
  74. {
  75. unsigned long filenum; /* file number */
  76. unsigned long fieldno; /* field number */
  77. } OLIST;
  78. static OLIST *olist; /* output field list */
  79. static unsigned long olistcnt; /* output field list count */
  80. static unsigned long olistalloc; /* output field allocated count */
  81. static int joinout = 1; /* show lines with matched join fields (-v) */
  82. static int needsep; /* need separator character */
  83. static int spans = 1; /* span multiple delimiters (-t) */
  84. static char *empty; /* empty field replacement string (-e) */
  85. static wchar_t default_tabchar[] = L" \t";
  86. static wchar_t *tabchar = default_tabchar; /* delimiter characters (-t) */
  87. static int cmp(LINE *, unsigned long, LINE *, unsigned long);
  88. static void fieldarg(char *);
  89. static void joinlines(INPUT *, INPUT *);
  90. static int mbscoll(const char *, const char *);
  91. static char *mbssep(char **, const wchar_t *);
  92. static void outfield(LINE *, unsigned long, int);
  93. static void outoneline(INPUT *, LINE *);
  94. static void outtwoline(INPUT *, LINE *, INPUT *, LINE *);
  95. static void slurp(INPUT *);
  96. static wchar_t *towcs(const char *);
  97. static void usage(void);
  98. int
  99. main(int argc, char *argv[])
  100. {
  101. INPUT *F1, *F2;
  102. int aflag, ch, cval, vflag;
  103. char *end;
  104. setlocale(LC_ALL, "");
  105. F1 = &input1;
  106. F2 = &input2;
  107. aflag = vflag = 0;
  108. while((ch = getopt(argc, argv, "a:e:j:1:2:o:t:v:")) != -1)
  109. {
  110. switch(ch)
  111. {
  112. case '1':
  113. if((F1->joinf = strtol(optarg, &end, 10)) < 1) errx(1, "-1 option field number less than 1");
  114. if(*end) errx(1, "illegal field number -- %s", optarg);
  115. --F1->joinf;
  116. break;
  117. case '2':
  118. if((F2->joinf = strtol(optarg, &end, 10)) < 1) errx(1, "-2 option field number less than 1");
  119. if(*end) errx(1, "illegal field number -- %s", optarg);
  120. --F2->joinf;
  121. break;
  122. case 'a':
  123. aflag = 1;
  124. switch(strtol(optarg, &end, 10))
  125. {
  126. case 1:
  127. F1->unpair = 1;
  128. break;
  129. case 2:
  130. F2->unpair = 1;
  131. break;
  132. default:
  133. errx(1, "-a option file number not 1 or 2");
  134. break;
  135. }
  136. if(*end) errx(1, "illegal file number -- %s", optarg);
  137. break;
  138. case 'e':
  139. empty = optarg;
  140. break;
  141. case 'j':
  142. if((F1->joinf = F2->joinf = strtol(optarg, &end, 10)) < 1)
  143. errx(1, "-j option field number less than 1");
  144. if(*end) errx(1, "illegal field number -- %s", optarg);
  145. --F1->joinf;
  146. --F2->joinf;
  147. break;
  148. case 'o':
  149. fieldarg(optarg);
  150. break;
  151. case 't':
  152. spans = 0;
  153. if(mbrtowc(&tabchar[0], optarg, MB_LEN_MAX, NULL) != strlen(optarg))
  154. errx(1, "illegal tab character specification");
  155. tabchar[1] = L'\0';
  156. break;
  157. case 'v':
  158. vflag = 1;
  159. joinout = 0;
  160. switch(strtol(optarg, &end, 10))
  161. {
  162. case 1:
  163. F1->unpair = 1;
  164. break;
  165. case 2:
  166. F2->unpair = 1;
  167. break;
  168. default:
  169. errx(1, "-v option file number not 1 or 2");
  170. break;
  171. }
  172. if(*end) errx(1, "illegal file number -- %s", optarg);
  173. break;
  174. case '?':
  175. default:
  176. usage();
  177. }
  178. }
  179. argc -= optind;
  180. argv += optind;
  181. if(aflag && vflag) errx(1, "the -a and -v options are mutually exclusive");
  182. if(argc != 2) usage();
  183. /* Open the files; "-" means stdin. */
  184. if(!strcmp(*argv, "-"))
  185. F1->fp = stdin;
  186. else if((F1->fp = fopen(*argv, "r")) == NULL)
  187. err(1, "%s", *argv);
  188. ++argv;
  189. if(!strcmp(*argv, "-"))
  190. F2->fp = stdin;
  191. else if((F2->fp = fopen(*argv, "r")) == NULL)
  192. err(1, "%s", *argv);
  193. if(F1->fp == stdin && F2->fp == stdin) errx(1, "only one input file may be stdin");
  194. slurp(F1);
  195. slurp(F2);
  196. while(F1->setcnt && F2->setcnt)
  197. {
  198. cval = cmp(F1->set, F1->joinf, F2->set, F2->joinf);
  199. if(cval == 0)
  200. {
  201. /* Oh joy, oh rapture, oh beauty divine! */
  202. if(joinout) joinlines(F1, F2);
  203. slurp(F1);
  204. slurp(F2);
  205. }
  206. else if(cval < 0)
  207. {
  208. /* File 1 takes the lead... */
  209. if(F1->unpair) joinlines(F1, NULL);
  210. slurp(F1);
  211. }
  212. else
  213. {
  214. /* File 2 takes the lead... */
  215. if(F2->unpair) joinlines(F2, NULL);
  216. slurp(F2);
  217. }
  218. }
  219. /*
  220. * Now that one of the files is used up, optionally output any
  221. * remaining lines from the other file.
  222. */
  223. if(F1->unpair)
  224. while(F1->setcnt)
  225. {
  226. joinlines(F1, NULL);
  227. slurp(F1);
  228. }
  229. if(F2->unpair)
  230. while(F2->setcnt)
  231. {
  232. joinlines(F2, NULL);
  233. slurp(F2);
  234. }
  235. exit(0);
  236. }
  237. static void
  238. slurp(INPUT *F)
  239. {
  240. LINE *lp, *lastlp, tmp;
  241. size_t len;
  242. int cnt;
  243. char *bp, *fieldp;
  244. /*
  245. * Read all of the lines from an input file that have the same
  246. * join field.
  247. */
  248. F->setcnt = 0;
  249. for(lastlp = NULL;; ++F->setcnt)
  250. {
  251. /*
  252. * If we're out of space to hold line structures, allocate
  253. * more. Initialize the structure so that we know that this
  254. * is new space.
  255. */
  256. if(F->setcnt == F->setalloc)
  257. {
  258. cnt = F->setalloc;
  259. F->setalloc += 50;
  260. if((F->set = realloc(F->set, F->setalloc * sizeof(LINE))) == NULL) err(1, NULL);
  261. memset(F->set + cnt, 0, 50 * sizeof(LINE));
  262. /* re-set lastlp in case it moved */
  263. if(lastlp != NULL) lastlp = &F->set[F->setcnt - 1];
  264. }
  265. /*
  266. * Get any pushed back line, else get the next line. Allocate
  267. * space as necessary. If taking the line from the stack swap
  268. * the two structures so that we don't lose space allocated to
  269. * either structure. This could be avoided by doing another
  270. * level of indirection, but it's probably okay as is.
  271. */
  272. lp = &F->set[F->setcnt];
  273. if(F->setcnt) lastlp = &F->set[F->setcnt - 1];
  274. if(F->pushbool)
  275. {
  276. tmp = F->set[F->setcnt];
  277. F->set[F->setcnt] = F->set[F->pushback];
  278. F->set[F->pushback] = tmp;
  279. F->pushbool = 0;
  280. continue;
  281. }
  282. if((bp = fgetln(F->fp, &len)) == NULL) return;
  283. if(lp->linealloc <= len + 1)
  284. {
  285. lp->linealloc += MAX(100, len + 1 - lp->linealloc);
  286. if((lp->line = realloc(lp->line, lp->linealloc)) == NULL) err(1, NULL);
  287. }
  288. memmove(lp->line, bp, len);
  289. /* Replace trailing newline, if it exists. */
  290. if(bp[len - 1] == '\n')
  291. lp->line[len - 1] = '\0';
  292. else
  293. lp->line[len] = '\0';
  294. bp = lp->line;
  295. /* Split the line into fields, allocate space as necessary. */
  296. lp->fieldcnt = 0;
  297. while((fieldp = mbssep(&bp, tabchar)) != NULL)
  298. {
  299. if(spans && *fieldp == '\0') continue;
  300. if(lp->fieldcnt == lp->fieldalloc)
  301. {
  302. lp->fieldalloc += 50;
  303. if((lp->fields = realloc(lp->fields, lp->fieldalloc * sizeof(char *))) == NULL)
  304. err(1, NULL);
  305. }
  306. lp->fields[lp->fieldcnt++] = fieldp;
  307. }
  308. /* See if the join field value has changed. */
  309. if(lastlp != NULL && cmp(lp, F->joinf, lastlp, F->joinf))
  310. {
  311. F->pushbool = 1;
  312. F->pushback = F->setcnt;
  313. break;
  314. }
  315. }
  316. }
  317. static char *
  318. mbssep(char **stringp, const wchar_t *delim)
  319. {
  320. char *s, *tok;
  321. const wchar_t *spanp;
  322. wchar_t c, sc;
  323. size_t n;
  324. if((s = *stringp) == NULL) return (NULL);
  325. for(tok = s;;)
  326. {
  327. n = mbrtowc(&c, s, MB_LEN_MAX, NULL);
  328. if(n == (size_t)-1 || n == (size_t)-2)
  329. {
  330. fprintf(stderr, "join: mbrtowc: Illegal byte sequence\n");
  331. exit(1);
  332. }
  333. s += n;
  334. spanp = delim;
  335. do
  336. {
  337. if((sc = *spanp++) == c)
  338. {
  339. if(c == 0)
  340. s = NULL;
  341. else
  342. s[-n] = '\0';
  343. *stringp = s;
  344. return (tok);
  345. }
  346. } while(sc != 0);
  347. }
  348. }
  349. static int
  350. cmp(LINE *lp1, unsigned long fieldno1, LINE *lp2, unsigned long fieldno2)
  351. {
  352. if(lp1->fieldcnt <= fieldno1) return (lp2->fieldcnt <= fieldno2 ? 0 : 1);
  353. if(lp2->fieldcnt <= fieldno2) return (-1);
  354. return (mbscoll(lp1->fields[fieldno1], lp2->fields[fieldno2]));
  355. }
  356. static int
  357. mbscoll(const char *s1, const char *s2)
  358. {
  359. wchar_t *w1, *w2;
  360. int ret;
  361. if(MB_CUR_MAX == 1) return (strcoll(s1, s2));
  362. if((w1 = towcs(s1)) == NULL || (w2 = towcs(s2)) == NULL) err(1, NULL); /* XXX */
  363. ret = wcscoll(w1, w2);
  364. free(w1);
  365. free(w2);
  366. return (ret);
  367. }
  368. static wchar_t *
  369. towcs(const char *s)
  370. {
  371. wchar_t *wcs;
  372. size_t n;
  373. if((n = mbsrtowcs(NULL, &s, 0, NULL)) == (size_t)-1) return (NULL);
  374. if((wcs = calloc((n + 1), sizeof(*wcs))) == NULL) return (NULL);
  375. mbsrtowcs(wcs, &s, n + 1, NULL);
  376. return (wcs);
  377. }
  378. static void
  379. joinlines(INPUT *F1, INPUT *F2)
  380. {
  381. unsigned long cnt1, cnt2;
  382. /*
  383. * Output the results of a join comparison. The output may be from
  384. * either file 1 or file 2 (in which case the first argument is the
  385. * file from which to output) or from both.
  386. */
  387. if(F2 == NULL)
  388. {
  389. for(cnt1 = 0; cnt1 < F1->setcnt; ++cnt1)
  390. outoneline(F1, &F1->set[cnt1]);
  391. return;
  392. }
  393. for(cnt1 = 0; cnt1 < F1->setcnt; ++cnt1)
  394. for(cnt2 = 0; cnt2 < F2->setcnt; ++cnt2)
  395. outtwoline(F1, &F1->set[cnt1], F2, &F2->set[cnt2]);
  396. }
  397. static void
  398. outoneline(INPUT *F, LINE *lp)
  399. {
  400. unsigned long cnt;
  401. /*
  402. * Output a single line from one of the files, according to the
  403. * join rules. This happens when we are writing unmatched single
  404. * lines. Output empty fields in the right places.
  405. */
  406. if(olist)
  407. for(cnt = 0; cnt < olistcnt; ++cnt)
  408. {
  409. if(olist[cnt].filenum == (unsigned)F->number)
  410. outfield(lp, olist[cnt].fieldno, 0);
  411. else if(olist[cnt].filenum == 0)
  412. outfield(lp, F->joinf, 0);
  413. else
  414. outfield(lp, 0, 1);
  415. }
  416. else
  417. {
  418. /*
  419. * Output the join field, then the remaining fields.
  420. */
  421. outfield(lp, F->joinf, 0);
  422. for(cnt = 0; cnt < lp->fieldcnt; ++cnt)
  423. if(F->joinf != cnt) outfield(lp, cnt, 0);
  424. }
  425. (void)printf("\n");
  426. if(ferror(stdout)) err(1, "stdout");
  427. needsep = 0;
  428. }
  429. static void
  430. outtwoline(INPUT *F1, LINE *lp1, INPUT *F2, LINE *lp2)
  431. {
  432. unsigned long cnt;
  433. /* Output a pair of lines according to the join list (if any). */
  434. if(olist)
  435. for(cnt = 0; cnt < olistcnt; ++cnt)
  436. if(olist[cnt].filenum == 0)
  437. {
  438. if(lp1->fieldcnt >= F1->joinf)
  439. outfield(lp1, F1->joinf, 0);
  440. else
  441. outfield(lp2, F2->joinf, 0);
  442. }
  443. else if(olist[cnt].filenum == 1)
  444. outfield(lp1, olist[cnt].fieldno, 0);
  445. else /* if (olist[cnt].filenum == 2) */
  446. outfield(lp2, olist[cnt].fieldno, 0);
  447. else
  448. {
  449. /*
  450. * Output the join field, then the remaining fields from F1
  451. * and F2.
  452. */
  453. outfield(lp1, F1->joinf, 0);
  454. for(cnt = 0; cnt < lp1->fieldcnt; ++cnt)
  455. if(F1->joinf != cnt) outfield(lp1, cnt, 0);
  456. for(cnt = 0; cnt < lp2->fieldcnt; ++cnt)
  457. if(F2->joinf != cnt) outfield(lp2, cnt, 0);
  458. }
  459. (void)printf("\n");
  460. if(ferror(stdout)) err(1, "stdout");
  461. needsep = 0;
  462. }
  463. static void
  464. outfield(LINE *lp, unsigned long fieldno, int out_empty)
  465. {
  466. if(needsep++) (void)printf("%lc", (wint_t)*tabchar);
  467. if(!ferror(stdout))
  468. {
  469. if(lp->fieldcnt <= fieldno || out_empty)
  470. {
  471. if(empty != NULL) (void)printf("%s", empty);
  472. }
  473. else
  474. {
  475. if(*lp->fields[fieldno] == '\0') return;
  476. (void)printf("%s", lp->fields[fieldno]);
  477. }
  478. }
  479. if(ferror(stdout)) err(1, "stdout");
  480. }
  481. /*
  482. * Convert an output list argument "2.1, 1.3, 2.4" into an array of output
  483. * fields.
  484. */
  485. static void
  486. fieldarg(char *option)
  487. {
  488. unsigned long fieldno, filenum;
  489. char *end, *token;
  490. while((token = strsep(&option, ", \t")) != NULL)
  491. {
  492. if(*token == '\0') continue;
  493. if(token[0] == '0')
  494. filenum = fieldno = 0;
  495. else if((token[0] == '1' || token[0] == '2') && token[1] == '.')
  496. {
  497. filenum = token[0] - '0';
  498. fieldno = strtol(token + 2, &end, 10);
  499. if(*end) errx(1, "malformed -o option field");
  500. if(fieldno == 0) errx(1, "field numbers are 1 based");
  501. --fieldno;
  502. }
  503. else
  504. errx(1, "malformed -o option field");
  505. if(olistcnt == olistalloc)
  506. {
  507. olistalloc += 50;
  508. if((olist = realloc(olist, olistalloc * sizeof(OLIST))) == NULL) err(1, NULL);
  509. }
  510. olist[olistcnt].filenum = filenum;
  511. olist[olistcnt].fieldno = fieldno;
  512. ++olistcnt;
  513. }
  514. }
  515. static void
  516. usage(void)
  517. {
  518. (void)fprintf(stderr,
  519. "%s %s\n%s\n",
  520. "usage: join [-a fileno | -v fileno ] [-e string] [-1 field]",
  521. "[-2 field]",
  522. " [-o list] [-t char] file1 file2");
  523. exit(1);
  524. }