logo

utils-std

Collection of commonly available Unix tools git clone https://anongit.hacktivis.me/git/utils-std.git

tr_str.c (7430B)


  1. // SPDX-License-Identifier: BSD-3-Clause
  2. /* $OpenBSD: str.c,v 1.15 2023/05/04 16:08:29 tb Exp $ */
  3. /* $NetBSD: str.c,v 1.7 1995/08/31 22:13:47 jtc Exp $ */
  4. /*-
  5. * Copyright (c) 1991, 1993
  6. * The Regents of the University of California. All rights reserved.
  7. *
  8. * Redistribution and use in source and binary forms, with or without
  9. * modification, are permitted provided that the following conditions
  10. * are met:
  11. * 1. Redistributions of source code must retain the above copyright
  12. * notice, this list of conditions and the following disclaimer.
  13. * 2. Redistributions in binary form must reproduce the above copyright
  14. * notice, this list of conditions and the following disclaimer in the
  15. * documentation and/or other materials provided with the distribution.
  16. * 3. Neither the name of the University nor the names of its contributors
  17. * may be used to endorse or promote products derived from this software
  18. * without specific prior written permission.
  19. *
  20. * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  21. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  22. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  23. * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  24. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  25. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  26. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  27. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  28. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  29. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  30. * SUCH DAMAGE.
  31. */
  32. #define _POSIX_C_SOURCE 202405L
  33. #include "./tr_str.h"
  34. #include "./reallocarray.h"
  35. #include <assert.h>
  36. #include <ctype.h>
  37. #include <err.h>
  38. #include <errno.h>
  39. #include <stddef.h>
  40. #include <stdio.h>
  41. #include <stdlib.h>
  42. #include <string.h>
  43. #include <sys/types.h>
  44. static int backslash(STR *);
  45. static int bracket(STR *);
  46. static int c_class(const void *, const void *);
  47. static void genclass(STR *);
  48. static void genequiv(STR *);
  49. static int genrange(STR *);
  50. static void genseq(STR *);
  51. int
  52. next(STR *s)
  53. {
  54. int ch;
  55. switch(s->state)
  56. {
  57. case EOS:
  58. return (0);
  59. case INFINITE:
  60. return (1);
  61. case NORMAL:
  62. switch(ch = *s->str)
  63. {
  64. case '\0':
  65. s->state = EOS;
  66. return (0);
  67. case '\\':
  68. s->lastch = backslash(s);
  69. break;
  70. case '[':
  71. if(bracket(s)) return (next(s));
  72. /* FALLTHROUGH */
  73. default:
  74. ++s->str;
  75. s->lastch = ch;
  76. break;
  77. }
  78. /* We can start a range at any time. */
  79. if(s->str[0] == '-' && genrange(s)) return (next(s));
  80. return (1);
  81. case RANGE:
  82. if(s->cnt-- == 0)
  83. {
  84. s->state = NORMAL;
  85. return (next(s));
  86. }
  87. ++s->lastch;
  88. return (1);
  89. case SEQUENCE:
  90. if(s->cnt-- == 0)
  91. {
  92. s->state = NORMAL;
  93. return (next(s));
  94. }
  95. return (1);
  96. case SET:
  97. if((s->lastch = s->set[s->cnt++]) == OOBCH)
  98. {
  99. s->state = NORMAL;
  100. return (next(s));
  101. }
  102. return (1);
  103. default:
  104. return 0;
  105. }
  106. /* NOTREACHED */
  107. }
  108. static int
  109. bracket(STR *s)
  110. {
  111. char *p;
  112. switch(s->str[1])
  113. {
  114. case ':': /* "[:class:]" */
  115. if((p = strstr((char *)s->str + 2, ":]")) == NULL) return (0);
  116. *p = '\0';
  117. s->str += 2;
  118. genclass(s);
  119. s->str = (unsigned char *)p + 2;
  120. return (1);
  121. case '=': /* "[=equiv=]" */
  122. if(strstr((char *)s->str + 2, "=]") == NULL) return (0);
  123. s->str += 2;
  124. genequiv(s);
  125. return (1);
  126. default: /* "[\###*n]" or "[#*n]" */
  127. if((p = strpbrk((char *)s->str + 2, "*]")) == NULL) return (0);
  128. if(p[0] != '*' || strchr(p, ']') == NULL) return (0);
  129. s->str += 1;
  130. genseq(s);
  131. return (1);
  132. }
  133. /* NOTREACHED */
  134. }
  /*
   * One character class usable as "[:name:]".
   *
   *   name - class name as written between "[:" and ":]"
   *   func - <ctype.h> predicate deciding membership
   *   set  - member list built on first use by genclass(); NULL until then
   */
  typedef struct
  {
      const char *name;
      int (*func)(int);
      int *set;
  } CLASS;

  /*
   * Table of known classes.  genclass() searches it with bsearch() using a
   * strcmp()-based comparator, so entries must stay sorted by name.
   */
  static CLASS classes[] = {
      /* clang-format off */
      { .name = "alnum",  .func = isalnum,  .set = NULL },
      { .name = "alpha",  .func = isalpha,  .set = NULL },
      { .name = "blank",  .func = isblank,  .set = NULL },
      { .name = "cntrl",  .func = iscntrl,  .set = NULL },
      { .name = "digit",  .func = isdigit,  .set = NULL },
      { .name = "graph",  .func = isgraph,  .set = NULL },
      { .name = "lower",  .func = islower,  .set = NULL },
      { .name = "print",  .func = isprint,  .set = NULL },
      { .name = "punct",  .func = ispunct,  .set = NULL },
      { .name = "space",  .func = isspace,  .set = NULL },
      { .name = "upper",  .func = isupper,  .set = NULL },
      { .name = "xdigit", .func = isxdigit, .set = NULL },
      /* clang-format on */
  };
  157. static void
  158. genclass(STR *s)
  159. {
  160. CLASS *cp, tmp;
  161. size_t len;
  162. int i;
  163. tmp.name = (char *)s->str;
  164. if((cp = (CLASS *)bsearch(
  165. &tmp, classes, sizeof(classes) / sizeof(CLASS), sizeof(CLASS), c_class)) == NULL)
  166. errx(1, "unknown class %s", s->str);
  167. /*
  168. * Generate the set of characters in the class if we haven't
  169. * already done so.
  170. */
  171. if(cp->set == NULL)
  172. {
  173. len = NCHARS + 1;
  174. assert(len != 0);
  175. cp->set = reallocarray(NULL, len, sizeof(*cp->set));
  176. if(cp->set == NULL) err(1, NULL);
  177. len = 0;
  178. for(i = 0; i < NCHARS; i++)
  179. {
  180. if(cp->func(i)) cp->set[len++] = i;
  181. }
  182. cp->set[len++] = OOBCH;
  183. assert(len != 0);
  184. cp->set = reallocarray(cp->set, len, sizeof(*cp->set));
  185. if(cp->set == NULL) err(1, NULL);
  186. }
  187. s->cnt = 0;
  188. s->state = SET;
  189. s->set = cp->set;
  190. }
  191. static int
  192. c_class(const void *a, const void *b)
  193. {
  194. return (strcmp(((CLASS *)a)->name, ((CLASS *)b)->name));
  195. }
  196. /*
  197. * English doesn't have any equivalence classes, so for now
  198. * we just syntax check and grab the character.
  199. */
  200. static void
  201. genequiv(STR *s)
  202. {
  203. if(*s->str == '\\')
  204. {
  205. s->equiv[0] = backslash(s);
  206. if(*s->str != '=') errx(1, "misplaced equivalence equals sign");
  207. }
  208. else
  209. {
  210. s->equiv[0] = s->str[0];
  211. if(s->str[1] != '=') errx(1, "misplaced equivalence equals sign");
  212. }
  213. s->str += 2;
  214. s->cnt = 0;
  215. s->state = SET;
  216. s->set = s->equiv;
  217. }
  218. static int
  219. genrange(STR *s)
  220. {
  221. int stopval;
  222. unsigned char *savestart;
  223. savestart = s->str;
  224. stopval = *++s->str == '\\' ? backslash(s) : *s->str++;
  225. if(stopval < (unsigned char)s->lastch)
  226. {
  227. s->str = savestart;
  228. return (0);
  229. }
  230. s->cnt = stopval - s->lastch + 1;
  231. s->state = RANGE;
  232. --s->lastch;
  233. return (1);
  234. }
  235. static void
  236. genseq(STR *s)
  237. {
  238. char *ep;
  239. if(s->which == STRING1) errx(1, "sequences only valid in string2");
  240. if(*s->str == '\\')
  241. s->lastch = backslash(s);
  242. else
  243. s->lastch = *s->str++;
  244. if(*s->str != '*') errx(1, "misplaced sequence asterisk");
  245. switch(*++s->str)
  246. {
  247. case '\\':
  248. s->cnt = backslash(s);
  249. break;
  250. case ']':
  251. s->cnt = 0;
  252. ++s->str;
  253. break;
  254. default:
  255. if(isdigit(*s->str))
  256. {
  257. s->cnt = strtol((char *)s->str, &ep, 0);
  258. if(*ep == ']')
  259. {
  260. s->str = (unsigned char *)ep + 1;
  261. break;
  262. }
  263. }
  264. errx(1, "illegal sequence count");
  265. /* NOTREACHED */
  266. }
  267. s->state = s->cnt ? SEQUENCE : INFINITE;
  268. }
  269. /*
  270. * Translate \??? into a character. Up to 3 octal digits, if no digits either
  271. * an escape code or a literal character.
  272. */
  273. static int
  274. backslash(STR *s)
  275. {
  276. size_t i;
  277. int ch, val;
  278. assert(*s->str == '\\');
  279. s->str++;
  280. /* Empty escapes become plain backslashes. */
  281. if(*s->str == '\0')
  282. {
  283. s->state = EOS;
  284. return ('\\');
  285. }
  286. val = 0;
  287. for(i = 0; i < 3; i++)
  288. {
  289. if(s->str[i] < '0' || '7' < s->str[i]) break;
  290. val = val * 8 + s->str[i] - '0';
  291. }
  292. if(i > 0)
  293. {
  294. if(val > UCHAR_MAX) errx(1, "octal value out of range: %d", val);
  295. s->str += i;
  296. return (val);
  297. }
  298. ch = *s->str++;
  299. switch(ch)
  300. {
  301. case 'a': /* escape characters */
  302. return ('\7');
  303. case 'b':
  304. return ('\b');
  305. case 'f':
  306. return ('\f');
  307. case 'n':
  308. return ('\n');
  309. case 'r':
  310. return ('\r');
  311. case 't':
  312. return ('\t');
  313. case 'v':
  314. return ('\13');
  315. default: /* \x" -> x */
  316. return (ch);
  317. }
  318. }