logo

utils-std

Collection of commonly available Unix tools. Clone with: git clone https://anongit.hacktivis.me/git/utils-std.git/

tr_str.c (7587B)


  1. // SPDX-License-Identifier: BSD-3-Clause
  2. /* $OpenBSD: str.c,v 1.15 2023/05/04 16:08:29 tb Exp $ */
  3. /* $NetBSD: str.c,v 1.7 1995/08/31 22:13:47 jtc Exp $ */
  4. /*-
  5. * Copyright (c) 1991, 1993
  6. * The Regents of the University of California. All rights reserved.
  7. *
  8. * Redistribution and use in source and binary forms, with or without
  9. * modification, are permitted provided that the following conditions
  10. * are met:
  11. * 1. Redistributions of source code must retain the above copyright
  12. * notice, this list of conditions and the following disclaimer.
  13. * 2. Redistributions in binary form must reproduce the above copyright
  14. * notice, this list of conditions and the following disclaimer in the
  15. * documentation and/or other materials provided with the distribution.
  16. * 3. Neither the name of the University nor the names of its contributors
  17. * may be used to endorse or promote products derived from this software
  18. * without specific prior written permission.
  19. *
  20. * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  21. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  22. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  23. * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  24. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  25. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  26. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  27. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  28. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  29. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  30. * SUCH DAMAGE.
  31. */
  32. // clang-format off
  33. #define _POSIX_C_SOURCE 202405L
  34. // Needs to be the first included header due to this horrible BSD_VISIBLE macro
  35. #include "./reallocarray.h"
  36. #include "./tr_str.h"
  37. // clang-format on
  38. #include "./err.h"
  39. #include <assert.h>
  40. #include <ctype.h>
  41. #include <stddef.h>
  42. #include <stdio.h>
  43. #include <stdlib.h>
  44. #include <string.h>
  45. #include <sys/types.h>
  46. static int backslash(STR *);
  47. static int bracket(STR *);
  48. static int c_class(const void *, const void *);
  49. static void genclass(STR *);
  50. static void genequiv(STR *);
  51. static int genrange(STR *);
  52. static void genseq(STR *);
  53. int
  54. next(STR *s)
  55. {
  56. int ch;
  57. switch(s->state)
  58. {
  59. case EOS:
  60. return (0);
  61. case INFINITE:
  62. return (1);
  63. case NORMAL:
  64. switch(ch = *s->str)
  65. {
  66. case '\0':
  67. s->state = EOS;
  68. return (0);
  69. case '\\':
  70. s->lastch = backslash(s);
  71. break;
  72. case '[':
  73. if(bracket(s)) return (next(s));
  74. /* FALLTHROUGH */
  75. default:
  76. ++s->str;
  77. s->lastch = ch;
  78. break;
  79. }
  80. /* We can start a range at any time. */
  81. if(s->str[0] == '-' && genrange(s)) return (next(s));
  82. return (1);
  83. case RANGE:
  84. if(s->cnt-- == 0)
  85. {
  86. s->state = NORMAL;
  87. return (next(s));
  88. }
  89. ++s->lastch;
  90. return (1);
  91. case SEQUENCE:
  92. if(s->cnt-- == 0)
  93. {
  94. s->state = NORMAL;
  95. return (next(s));
  96. }
  97. return (1);
  98. case SET:
  99. if((s->lastch = s->set[s->cnt++]) == OOBCH)
  100. {
  101. s->state = NORMAL;
  102. return (next(s));
  103. }
  104. return (1);
  105. default:
  106. return 0;
  107. }
  108. /* NOTREACHED */
  109. }
  110. static int
  111. bracket(STR *s)
  112. {
  113. char *p;
  114. switch(s->str[1])
  115. {
  116. case ':': /* "[:class:]" */
  117. if((p = strstr((char *)s->str + 2, ":]")) == NULL) return (0);
  118. *p = '\0';
  119. s->str += 2;
  120. genclass(s);
  121. s->str = (unsigned char *)p + 2;
  122. return (1);
  123. case '=': /* "[=equiv=]" */
  124. if(strstr((char *)s->str + 2, "=]") == NULL) return (0);
  125. s->str += 2;
  126. genequiv(s);
  127. return (1);
  128. default: /* "[\###*n]" or "[#*n]" */
  129. if((p = strpbrk((char *)s->str + 2, "*]")) == NULL) return (0);
  130. if(p[0] != '*' || strchr(p, ']') == NULL) return (0);
  131. s->str += 1;
  132. genseq(s);
  133. return (1);
  134. }
  135. /* NOTREACHED */
  136. }
  137. typedef struct
  138. {
  139. const char *name;
  140. int (*func)(int);
  141. int *set;
  142. } CLASS;
  143. static CLASS classes[] = {
  144. /* clang-format off */
  145. { "alnum", isalnum, NULL, },
  146. { "alpha", isalpha, NULL, },
  147. { "blank", isblank, NULL, },
  148. { "cntrl", iscntrl, NULL, },
  149. { "digit", isdigit, NULL, },
  150. { "graph", isgraph, NULL, },
  151. { "lower", islower, NULL, },
  152. { "print", isprint, NULL, },
  153. { "punct", ispunct, NULL, },
  154. { "space", isspace, NULL, },
  155. { "upper", isupper, NULL, },
  156. { "xdigit", isxdigit, NULL, },
  157. /* clang-format on */
  158. };
  159. static void
  160. genclass(STR *s)
  161. {
  162. CLASS *cp, tmp;
  163. size_t len;
  164. int i;
  165. tmp.name = (char *)s->str;
  166. if((cp = (CLASS *)bsearch(
  167. &tmp, classes, sizeof(classes) / sizeof(CLASS), sizeof(CLASS), c_class)) == NULL)
  168. utils_errx(1, "unknown class %s", s->str);
  169. /*
  170. * Generate the set of characters in the class if we haven't
  171. * already done so.
  172. */
  173. if(cp->set == NULL)
  174. {
  175. len = NCHARS + 1;
  176. assert(len != 0);
  177. cp->set = reallocarray(NULL, len, sizeof(*cp->set));
  178. if(cp->set == NULL) utils_err(1, NULL);
  179. len = 0;
  180. for(i = 0; i < NCHARS; i++)
  181. {
  182. if(cp->func(i)) cp->set[len++] = i;
  183. }
  184. cp->set[len++] = OOBCH;
  185. assert(len != 0);
  186. cp->set = reallocarray(cp->set, len, sizeof(*cp->set));
  187. if(cp->set == NULL) utils_err(1, NULL);
  188. }
  189. s->cnt = 0;
  190. s->state = SET;
  191. s->set = cp->set;
  192. }
  193. static int
  194. c_class(const void *a, const void *b)
  195. {
  196. return (strcmp(((CLASS *)a)->name, ((CLASS *)b)->name));
  197. }
  198. /*
  199. * English doesn't have any equivalence classes, so for now
  200. * we just syntax check and grab the character.
  201. */
  202. static void
  203. genequiv(STR *s)
  204. {
  205. if(*s->str == '\\')
  206. {
  207. s->equiv[0] = backslash(s);
  208. if(*s->str != '=') utils_errx(1, "misplaced equivalence equals sign");
  209. }
  210. else
  211. {
  212. s->equiv[0] = s->str[0];
  213. if(s->str[1] != '=') utils_errx(1, "misplaced equivalence equals sign");
  214. }
  215. s->str += 2;
  216. s->cnt = 0;
  217. s->state = SET;
  218. s->set = s->equiv;
  219. }
  220. static int
  221. genrange(STR *s)
  222. {
  223. int stopval;
  224. unsigned char *savestart;
  225. savestart = s->str;
  226. stopval = *++s->str == '\\' ? backslash(s) : *s->str++;
  227. if(stopval < (unsigned char)s->lastch)
  228. {
  229. s->str = savestart;
  230. return (0);
  231. }
  232. s->cnt = stopval - s->lastch + 1;
  233. s->state = RANGE;
  234. --s->lastch;
  235. return (1);
  236. }
  237. static void
  238. genseq(STR *s)
  239. {
  240. char *ep;
  241. if(s->which == STRING1) utils_errx(1, "sequences only valid in string2");
  242. if(*s->str == '\\')
  243. s->lastch = backslash(s);
  244. else
  245. s->lastch = *s->str++;
  246. if(*s->str != '*') utils_errx(1, "misplaced sequence asterisk");
  247. switch(*++s->str)
  248. {
  249. case '\\':
  250. s->cnt = backslash(s);
  251. break;
  252. case ']':
  253. s->cnt = 0;
  254. ++s->str;
  255. break;
  256. default:
  257. if(isdigit(*s->str))
  258. {
  259. s->cnt = strtol((char *)s->str, &ep, 0);
  260. if(*ep == ']')
  261. {
  262. s->str = (unsigned char *)ep + 1;
  263. break;
  264. }
  265. }
  266. utils_errx(1, "illegal sequence count");
  267. /* NOTREACHED */
  268. }
  269. s->state = s->cnt ? SEQUENCE : INFINITE;
  270. }
  271. /*
  272. * Translate \??? into a character. Up to 3 octal digits, if no digits either
  273. * an escape code or a literal character.
  274. */
  275. static int
  276. backslash(STR *s)
  277. {
  278. size_t i;
  279. int ch, val;
  280. assert(*s->str == '\\');
  281. s->str++;
  282. /* Empty escapes become plain backslashes. */
  283. if(*s->str == '\0')
  284. {
  285. s->state = EOS;
  286. return ('\\');
  287. }
  288. val = 0;
  289. for(i = 0; i < 3; i++)
  290. {
  291. if(s->str[i] < '0' || '7' < s->str[i]) break;
  292. val = val * 8 + s->str[i] - '0';
  293. }
  294. if(i > 0)
  295. {
  296. if(val > UCHAR_MAX) utils_errx(1, "octal value out of range: %d", val);
  297. s->str += i;
  298. return (val);
  299. }
  300. ch = *s->str++;
  301. switch(ch)
  302. {
  303. case 'a': /* escape characters */
  304. return ('\7');
  305. case 'b':
  306. return ('\b');
  307. case 'f':
  308. return ('\f');
  309. case 'n':
  310. return ('\n');
  311. case 'r':
  312. return ('\r');
  313. case 't':
  314. return ('\t');
  315. case 'v':
  316. return ('\13');
  317. default: /* \x" -> x */
  318. return (ch);
  319. }
  320. }