logo

utils-std

Collection of commonly available Unix tools. Clone with: git clone https://anongit.hacktivis.me/git/utils-std.git/

tr_str.c (7500B)


  1. // SPDX-License-Identifier: BSD-3-Clause
  2. /* $OpenBSD: str.c,v 1.15 2023/05/04 16:08:29 tb Exp $ */
  3. /* $NetBSD: str.c,v 1.7 1995/08/31 22:13:47 jtc Exp $ */
  4. /*-
  5. * Copyright (c) 1991, 1993
  6. * The Regents of the University of California. All rights reserved.
  7. *
  8. * Redistribution and use in source and binary forms, with or without
  9. * modification, are permitted provided that the following conditions
  10. * are met:
  11. * 1. Redistributions of source code must retain the above copyright
  12. * notice, this list of conditions and the following disclaimer.
  13. * 2. Redistributions in binary form must reproduce the above copyright
  14. * notice, this list of conditions and the following disclaimer in the
  15. * documentation and/or other materials provided with the distribution.
  16. * 3. Neither the name of the University nor the names of its contributors
  17. * may be used to endorse or promote products derived from this software
  18. * without specific prior written permission.
  19. *
  20. * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  21. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  22. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  23. * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  24. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  25. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  26. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  27. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  28. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  29. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  30. * SUCH DAMAGE.
  31. */
  32. #define _POSIX_C_SOURCE 202405L
  33. #include "./tr_str.h"
  34. #include "../libutils/err.h"
  35. #include "../libutils/reallocarray.h"
  36. #include <assert.h>
  37. #include <ctype.h>
  38. #include <stddef.h>
  39. #include <stdio.h>
  40. #include <stdlib.h>
  41. #include <string.h>
  42. #include <sys/types.h>
  43. static int backslash(STR *);
  44. static int bracket(STR *);
  45. static int c_class(const void *, const void *);
  46. static void genclass(STR *);
  47. static void genequiv(STR *);
  48. static int genrange(STR *);
  49. static void genseq(STR *);
  50. int
  51. next(STR *s)
  52. {
  53. int ch;
  54. switch(s->state)
  55. {
  56. case EOS:
  57. return (0);
  58. case INFINITE:
  59. return (1);
  60. case NORMAL:
  61. switch(ch = *s->str)
  62. {
  63. case '\0':
  64. s->state = EOS;
  65. return (0);
  66. case '\\':
  67. s->lastch = backslash(s);
  68. break;
  69. case '[':
  70. if(bracket(s)) return (next(s));
  71. /* FALLTHROUGH */
  72. default:
  73. ++s->str;
  74. s->lastch = ch;
  75. break;
  76. }
  77. /* We can start a range at any time. */
  78. if(s->str[0] == '-' && genrange(s)) return (next(s));
  79. return (1);
  80. case RANGE:
  81. if(s->cnt-- == 0)
  82. {
  83. s->state = NORMAL;
  84. return (next(s));
  85. }
  86. ++s->lastch;
  87. return (1);
  88. case SEQUENCE:
  89. if(s->cnt-- == 0)
  90. {
  91. s->state = NORMAL;
  92. return (next(s));
  93. }
  94. return (1);
  95. case SET:
  96. if((s->lastch = s->set[s->cnt++]) == OOBCH)
  97. {
  98. s->state = NORMAL;
  99. return (next(s));
  100. }
  101. return (1);
  102. default:
  103. return 0;
  104. }
  105. /* NOTREACHED */
  106. }
  107. static int
  108. bracket(STR *s)
  109. {
  110. char *p;
  111. switch(s->str[1])
  112. {
  113. case ':': /* "[:class:]" */
  114. if((p = strstr((char *)s->str + 2, ":]")) == NULL) return (0);
  115. *p = '\0';
  116. s->str += 2;
  117. genclass(s);
  118. s->str = (unsigned char *)p + 2;
  119. return (1);
  120. case '=': /* "[=equiv=]" */
  121. if(strstr((char *)s->str + 2, "=]") == NULL) return (0);
  122. s->str += 2;
  123. genequiv(s);
  124. return (1);
  125. default: /* "[\###*n]" or "[#*n]" */
  126. if((p = strpbrk((char *)s->str + 2, "*]")) == NULL) return (0);
  127. if(p[0] != '*' || strchr(p, ']') == NULL) return (0);
  128. s->str += 1;
  129. genseq(s);
  130. return (1);
  131. }
  132. /* NOTREACHED */
  133. }
  134. typedef struct
  135. {
  136. const char *name;
  137. int (*func)(int);
  138. int *set;
  139. } CLASS;
  140. static CLASS classes[] = {
  141. /* clang-format off */
  142. { "alnum", isalnum, NULL, },
  143. { "alpha", isalpha, NULL, },
  144. { "blank", isblank, NULL, },
  145. { "cntrl", iscntrl, NULL, },
  146. { "digit", isdigit, NULL, },
  147. { "graph", isgraph, NULL, },
  148. { "lower", islower, NULL, },
  149. { "print", isprint, NULL, },
  150. { "punct", ispunct, NULL, },
  151. { "space", isspace, NULL, },
  152. { "upper", isupper, NULL, },
  153. { "xdigit", isxdigit, NULL, },
  154. /* clang-format on */
  155. };
  156. static void
  157. genclass(STR *s)
  158. {
  159. CLASS *cp, tmp;
  160. size_t len;
  161. int i;
  162. tmp.name = (char *)s->str;
  163. if((cp = (CLASS *)bsearch(
  164. &tmp, classes, sizeof(classes) / sizeof(CLASS), sizeof(CLASS), c_class)) == NULL)
  165. utils_errx(1, "unknown class %s", s->str);
  166. /*
  167. * Generate the set of characters in the class if we haven't
  168. * already done so.
  169. */
  170. if(cp->set == NULL)
  171. {
  172. len = NCHARS + 1;
  173. assert(len != 0);
  174. cp->set = utils_reallocarray(NULL, len, sizeof(*cp->set));
  175. if(cp->set == NULL) utils_err(1, NULL);
  176. len = 0;
  177. for(i = 0; i < NCHARS; i++)
  178. {
  179. if(cp->func(i)) cp->set[len++] = i;
  180. }
  181. cp->set[len++] = OOBCH;
  182. assert(len != 0);
  183. cp->set = utils_reallocarray(cp->set, len, sizeof(*cp->set));
  184. if(cp->set == NULL) utils_err(1, NULL);
  185. }
  186. s->cnt = 0;
  187. s->state = SET;
  188. s->set = cp->set;
  189. }
  190. static int
  191. c_class(const void *a, const void *b)
  192. {
  193. return (strcmp(((CLASS *)a)->name, ((CLASS *)b)->name));
  194. }
  195. /*
  196. * English doesn't have any equivalence classes, so for now
  197. * we just syntax check and grab the character.
  198. */
  199. static void
  200. genequiv(STR *s)
  201. {
  202. if(*s->str == '\\')
  203. {
  204. s->equiv[0] = backslash(s);
  205. if(*s->str != '=') utils_errx(1, "misplaced equivalence equals sign");
  206. }
  207. else
  208. {
  209. s->equiv[0] = s->str[0];
  210. if(s->str[1] != '=') utils_errx(1, "misplaced equivalence equals sign");
  211. }
  212. s->str += 2;
  213. s->cnt = 0;
  214. s->state = SET;
  215. s->set = s->equiv;
  216. }
  217. static int
  218. genrange(STR *s)
  219. {
  220. int stopval;
  221. unsigned char *savestart;
  222. savestart = s->str;
  223. stopval = *++s->str == '\\' ? backslash(s) : *s->str++;
  224. if(stopval < (unsigned char)s->lastch)
  225. {
  226. s->str = savestart;
  227. return (0);
  228. }
  229. s->cnt = stopval - s->lastch + 1;
  230. s->state = RANGE;
  231. --s->lastch;
  232. return (1);
  233. }
  234. static void
  235. genseq(STR *s)
  236. {
  237. char *ep;
  238. if(s->which == STRING1) utils_errx(1, "sequences only valid in string2");
  239. if(*s->str == '\\')
  240. s->lastch = backslash(s);
  241. else
  242. s->lastch = *s->str++;
  243. if(*s->str != '*') utils_errx(1, "misplaced sequence asterisk");
  244. switch(*++s->str)
  245. {
  246. case '\\':
  247. s->cnt = backslash(s);
  248. break;
  249. case ']':
  250. s->cnt = 0;
  251. ++s->str;
  252. break;
  253. default:
  254. if(isdigit(*s->str))
  255. {
  256. s->cnt = strtol((char *)s->str, &ep, 0);
  257. if(*ep == ']')
  258. {
  259. s->str = (unsigned char *)ep + 1;
  260. break;
  261. }
  262. }
  263. utils_errx(1, "illegal sequence count");
  264. /* NOTREACHED */
  265. }
  266. s->state = s->cnt ? SEQUENCE : INFINITE;
  267. }
  268. /*
  269. * Translate \??? into a character. Up to 3 octal digits, if no digits either
  270. * an escape code or a literal character.
  271. */
  272. static int
  273. backslash(STR *s)
  274. {
  275. size_t i;
  276. int ch, val;
  277. assert(*s->str == '\\');
  278. s->str++;
  279. /* Empty escapes become plain backslashes. */
  280. if(*s->str == '\0')
  281. {
  282. s->state = EOS;
  283. return ('\\');
  284. }
  285. val = 0;
  286. for(i = 0; i < 3; i++)
  287. {
  288. if(s->str[i] < '0' || '7' < s->str[i]) break;
  289. val = val * 8 + s->str[i] - '0';
  290. }
  291. if(i > 0)
  292. {
  293. if(val > UCHAR_MAX) utils_errx(1, "octal value out of range: %d", val);
  294. s->str += i;
  295. return (val);
  296. }
  297. ch = *s->str++;
  298. switch(ch)
  299. {
  300. case 'a': /* escape characters */
  301. return ('\7');
  302. case 'b':
  303. return ('\b');
  304. case 'f':
  305. return ('\f');
  306. case 'n':
  307. return ('\n');
  308. case 'r':
  309. return ('\r');
  310. case 't':
  311. return ('\t');
  312. case 'v':
  313. return ('\13');
  314. default: /* \x" -> x */
  315. return (ch);
  316. }
  317. }