logo

oasis-root

Compiled tree of Oasis Linux, based on the maintainer's own branch at <https://hacktivis.me/git/oasis/>. Clone with: git clone https://anongit.hacktivis.me/git/oasis-root.git

awk.lua (9011B)


  1. -- Copyright 2006-2024 Mitchell. See LICENSE.
  2. -- AWK LPeg lexer.
  3. -- Modified by Wolfgang Seeberg 2012, 2013.
  -- Keep the global `lexer` module and the two LPeg constructors used below in locals.
  local lexer = lexer
  local P = lpeg.P
  local S = lpeg.S
  -- Create this lexer object; the chunk's own arguments are passed straight to lexer.new().
  local lex = lexer.new(...)
  -- Single characters the hand-written scanners compare against.
  local LEFTBRACKET, RIGHTBRACKET = '[', ']'
  local SLASH, BACKSLASH = '/', '\\'
  local CARET = '^'
  local CR, LF = '\r', '\n'
  local CRLF = '\r\n'
  local DQUOTE = '"'

  -- Closing delimiter for each opening delimiter that may follow a field's '$'.
  local DELIMITER_MATCHES = {
    ['('] = ')',
    ['['] = ']'
  }
  -- For each opening delimiter, the other kind of opening delimiter that is tracked alongside it.
  local COMPANION = {
    ['('] = '[',
    ['['] = '('
  }

  -- Names accepted between '[:' and ':]' inside a bracket expression (value 1 marks membership).
  local CC = {}
  for _, name in ipairs{
    'alnum', 'alpha', 'blank', 'cntrl', 'digit', 'graph', 'lower', 'print', 'punct', 'space',
    'upper', 'xdigit'
  } do
    CC[name] = 1
  end

  -- Index of the closing slash of the most recently scanned regex (written by eatRegex()).
  local LastRegexEnd = 0
  -- Index of a backslash that ends a comment line (written by scanComment()).
  local BackslashAtCommentEnd = 0

  -- Keywords after which a slash starts a regex rather than a division (value 1 marks membership).
  local KW_BEFORE_RX = {}
  for _, word in ipairs{'case', 'do', 'else', 'exit', 'print', 'printf', 'return'} do
    KW_BEFORE_RX[word] = 1
  end
  -- Reports whether the run of lowercase letters that ends at index `e` of `input` is listed in
  -- KW_BEFORE_RX. The answer is false when that run is directly preceded by an uppercase letter,
  -- a digit or an underscore, i.e. when it is only the tail of a longer identifier.
  local function findKeyword(input, e)
    -- Walk left over lowercase letters; `start` ends on the character just before the word.
    local start = e
    while start > 0 and input:find("^[%l]", start) do
      start = start - 1
    end
    -- start == 0 means the word begins at the very first character, so nothing can precede it.
    if start ~= 0 and input:find("^[%u%d_]", start) then return false end
    return KW_BEFORE_RX[input:sub(start + 1, e)] == 1
  end
  -- Decides, by looking backwards from index `i`, whether a slash that follows position `i` may
  -- start a regular expression. Returns true or false.
  local function isRegex(input, i)
    while true do
      -- Skip blanks and tabs to the left.
      while i >= 1 and input:find('^[ \t]', i) do
        i = i - 1
      end
      -- Nothing but blanks before the slash.
      if i < 1 then return true end

      local ch = input:sub(i, i)
      -- An operator-like character or one of the KW_BEFORE_RX keywords precedes the slash.
      if input:find("^[-!%%&(*+,:;<=>?[^{|}~\f]", i) then return true end
      if findKeyword(input, i) then return true end
      -- deals with /xx/ / /yy/: a slash that closed the previous regex is followed by division.
      if ch == SLASH then return i ~= LastRegexEnd end
      -- The end of an operand precedes the slash.
      if input:find('^[]%w)."]', i) then return false end

      -- Only a line break may remain; step over LF, CR LF or CR to the end of the previous line.
      if ch == LF then
        if i == 1 then return true end
        i = i - 1
        if input:sub(i, i) == CR then
          if i == 1 then return true end
          i = i - 1
        end
      elseif ch == CR then
        if i == 1 then return true end
        i = i - 1
      else
        return false
      end

      -- A trailing backslash (unless it is the one recorded as ending a comment) continues the
      -- previous line, so repeat the whole decision from the character before it.
      if input:sub(i, i) ~= BACKSLASH or i == BackslashAtCommentEnd then return true end
      i = i - 1
    end
  end
  -- Scans a character class name that starts at index `s` (just after '[:') and must be closed by
  -- ':]' no later than index `e`. Returns the index of the ']' of ':]' when the name is listed in
  -- CC; returns false for an unknown name, a line break, or a missing ':]'.
  local function eatCharacterClass(input, s, e)
    for pos = s, e do
      if input:find('^[\r\n]', pos) then return false end
      if input:sub(pos, pos + 1) == ':]' then
        local name = input:sub(s, pos - 1)
        if CC[name] == 1 then return pos + 1 end
        return false
      end
    end
    return false
  end
  -- Scans a bracket expression whose first character after '[' is at index `i`, looking no
  -- further than index `e`. Returns the index of the closing ']' or false when a line break, an
  -- invalid character class, or the end of the range is reached first.
  local function eatBrackets(input, i, e)
    -- A leading '^' and a ']' placed first are part of the set, not its end.
    if input:sub(i, i) == CARET then i = i + 1 end
    if input:sub(i, i) == RIGHTBRACKET then i = i + 1 end

    while i <= e do
      local ch = input:sub(i, i)
      if input:find('^[\r\n]', i) then return false end
      if ch == RIGHTBRACKET then return i end

      if input:sub(i, i + 1) == '[:' then
        i = eatCharacterClass(input, i + 2, e)
        if not i then return false end
      elseif ch == BACKSLASH then
        -- Skip the escaped character; an escaped CR LF pair counts as one.
        i = i + 1
        if input:sub(i, i + 1) == CRLF then i = i + 1 end
      end
      i = i + 1
    end
    return false
  end
  -- Scans the body of a slash-delimited regex starting at index `i` (just after the opening
  -- slash). Returns the index of the closing slash and records it in LastRegexEnd, or returns
  -- false when a line break, an unterminated bracket expression, or the end of input comes first.
  local function eatRegex(input, i)
    local last = #input
    while i <= last do
      local ch = input:sub(i, i)
      if input:find('^[\r\n]', i) then return false end
      if ch == SLASH then
        LastRegexEnd = i
        return i
      end

      if ch == LEFTBRACKET then
        -- Slashes inside a bracket expression do not end the regex.
        i = eatBrackets(input, i + 1, last)
        if not i then return false end
      elseif ch == BACKSLASH then
        -- Skip the escaped character; an escaped CR LF pair counts as one.
        i = i + 1
        if input:sub(i, i + 1) == CRLF then i = i + 1 end
      end
      i = i + 1
    end
    return false
  end
  -- Hand-over slot between the two regex rules: scanGawkRegex() stores here what scanRegex()
  -- has to return (the position after a plain regex, or false when there is no regex).
  local ScanRegexResult

  -- Match-time function for the 'gawkRegex' rule; `index` is the position just after the opening
  -- slash. Returns the position after the closing slash when the regex contains one of the
  -- escapes \B \S \s \W \w \y \< \> \` \' ; otherwise returns false and leaves the outcome for the
  -- plain 'regex' rule in ScanRegexResult.
  local function scanGawkRegex(input, index)
    -- index - 1 is the opening slash, so the context to inspect starts at index - 2.
    local close = isRegex(input, index - 2) and eatRegex(input, index)
    if not close then
      ScanRegexResult = false
      return false
    end

    local rx = input:sub(index - 1, close)
    -- Look at every maximal run of backslashes together with the character that follows it.
    -- (Requiring a non-backslash *before* the run, as a single pattern would, makes gmatch
    -- consume that character and miss an escape that directly follows a previous match,
    -- e.g. the \w in /\\s\w/.)
    for bs, ch in rx:gmatch("(\\+)([^\\])") do
      -- /\S/ is special, but /\\S/ is not: only an odd number of backslashes escapes `ch`.
      if #bs % 2 == 1 and ch:find("^[BSsWwy<>`']") then
        -- This rule matches, so the plain rule is not consulted; do not leave a stale result.
        ScanRegexResult = false
        return close + 1
      end
    end

    ScanRegexResult = close + 1
    return false
  end
  -- Match-time function for the plain 'regex' rule. It does no scanning of its own: it reports
  -- what the preceding scanGawkRegex() call stored in ScanRegexResult, so it must only run
  -- immediately after scanGawkRegex().
  local function scanRegex()
    return ScanRegexResult
  end
  -- Scans a double-quoted string whose first character after the opening quote is at `index`.
  -- Returns the position after the closing quote, or false when an unescaped line break or the
  -- end of input is reached first.
  local function scanString(input, index)
    local pos, last = index, #input
    while pos <= last do
      local ch = input:sub(pos, pos)
      if input:find('^[\r\n]', pos) then return false end
      if ch == DQUOTE then return pos + 1 end

      if ch == BACKSLASH then
        pos = pos + 1
        -- lexer.range() doesn't handle CRLF, hence this hand-written scanner: an escaped
        -- CR LF pair is skipped as a single escaped character.
        if input:sub(pos, pos + 1) == CRLF then pos = pos + 1 end
      end
      pos = pos + 1
    end
    return false
  end
  -- Match-time function for the text after '#': consumes everything up to, but not including,
  -- the next CR or LF and returns the position after it.
  -- Purpose: when the last consumed character is a backslash, its index is remembered in
  -- BackslashAtCommentEnd so that isRegex() does not enter a comment line ending in a backslash.
  local function scanComment(input, index)
    local last = select(2, input:find('[^\r\n]*', index))
    if input:sub(last, last) == BACKSLASH then
      BackslashAtCommentEnd = last
    end
    return last + 1
  end
  -- Match-time function for a delimited field expression. The character at index - 1 is the
  -- opening '(' or '['; scanning runs from `index` to the matching closing delimiter on the same
  -- logical line. Returns the position after that delimiter, or false when the expression is
  -- unbalanced, or a comment start ('#'), an unescaped line break, or the end of input comes
  -- first. Strings and regexes inside the expression are skipped as units.
  local function scanFieldDelimiters(input, index)
    local pos, last = index, #input

    -- The delimiter pair being closed, and the other pair, which only has to stay balanced.
    local open = input:sub(pos - 1, pos - 1)
    local close = DELIMITER_MATCHES[open]
    local depth = 1
    local otherOpen = COMPANION[open]
    local otherClose = DELIMITER_MATCHES[otherOpen]
    local otherDepth = 0

    while pos <= last do
      local ch = input:sub(pos, pos)
      if input:find('^[#\r\n]', pos) then return false end

      if ch == close then
        depth = depth - 1
        -- The outermost delimiter is closed: succeed only if the other pair is balanced too.
        if depth == 0 then return otherDepth == 0 and pos + 1 end
      elseif ch == open then
        depth = depth + 1
      elseif ch == otherClose then
        otherDepth = otherDepth - 1
        if otherDepth < 0 then return false end
      elseif ch == otherOpen then
        otherDepth = otherDepth + 1
      elseif ch == DQUOTE then
        pos = scanString(input, pos + 1)
        if not pos then return false end
        -- scanString() returns the position after the closing quote; step back onto the quote
        -- because the loop advances by one below.
        pos = pos - 1
      elseif ch == SLASH then
        if isRegex(input, pos - 1) then
          pos = eatRegex(input, pos + 1)
          if not pos then return false end
        end
      elseif ch == BACKSLASH then
        -- Line continuation: step over the escaped CR LF, CR or LF.
        if input:sub(pos + 1, pos + 2) == CRLF then
          pos = pos + 2
        elseif input:find('^[\r\n]', pos + 1) then
          pos = pos + 1
        end
      end
      pos = pos + 1
    end
    return false
  end
  -- Comments: '#' followed by the rest of the line.
  local comment = '#' * P(scanComment)
  lex:add_rule('comment', lex:tag(lexer.COMMENT, comment))

  -- Strings.
  local dq_string = DQUOTE * P(scanString)
  lex:add_rule('string', lex:tag(lexer.STRING, dq_string))

  -- Decimal number with optional fraction and exponent.
  -- No leading sign because it might be binary.
  local mantissa = lexer.digit^1 * ('.' * lexer.digit^0)^-1 + '.' * lexer.digit^1
  local exponent = S('eE') * S('+-')^-1 * lexer.digit^1
  local float = mantissa * exponent^-1

  -- Fields. E.g. $1, $a, $(x), $a(x), $a[x], $"1", $$a, etc.
  local field_call = lexer.word^0 * '(' * P(scanFieldDelimiters)
  local field_name = lexer.word^1 * ('[' * P(scanFieldDelimiters))^-1
  local field_string = '"' * P(scanString)
  local field_regex = '/' * P(eatRegex) * '/'
  local field = '$' * S('$+-')^0 * (float + field_call + field_name + field_string + field_regex)
  lex:add_rule('field', lex:tag(lexer.VARIABLE .. '.field', field))

  -- Regular expressions.
  -- A slash-delimited regular expression is preceded by most operators or by keywords such as
  -- 'print' and 'case', possibly on a preceding line. It can contain unescaped slashes and
  -- brackets inside brackets. Some escape sequences like '\S', '\s' have special meanings with
  -- Gawk; tokens that contain them get their own tag. The 'regex' rule must directly follow the
  -- 'gawkRegex' rule because scanRegex() relies on the result left by scanGawkRegex().
  lex:add_rule('gawkRegex', lex:tag(lexer.REGEX .. '.gawk', SLASH * P(scanGawkRegex)))
  lex:add_rule('regex', lex:tag(lexer.REGEX, SLASH * P(scanRegex)))

  -- Operators.
  local gawk_operator = P("|&") + "@" + "**=" + "**"
  lex:add_rule('gawkOperator', lex:tag(lexer.OPERATOR .. '.gawk', gawk_operator))
  lex:add_rule('operator', lex:tag(lexer.OPERATOR, S('!%&()*+,-/:;<=>?[\\]^{|}~')))

  -- Numbers.
  local gawk_number = lexer.hex_num + lexer.oct_num
  lex:add_rule('gawkNumber', lex:tag(lexer.NUMBER .. '.gawk', gawk_number))
  lex:add_rule('number', lex:tag(lexer.NUMBER, float))

  -- Keywords.
  lex:add_rule('keyword', lex:tag(lexer.KEYWORD, lex:word_match(lexer.KEYWORD)))

  -- Built-in variables.
  lex:add_rule('builtInVariable',
    lex:tag(lexer.VARIABLE_BUILTIN, lex:word_match(lexer.VARIABLE_BUILTIN)))
  lex:add_rule('gawkBuiltInVariable',
    lex:tag(lexer.VARIABLE_BUILTIN .. '.gawk', lex:word_match(lexer.VARIABLE_BUILTIN .. '.gawk')))

  -- Functions: a built-in name or any other word, but only when '(' follows immediately.
  local builtin_func = lex:tag(lexer.FUNCTION_BUILTIN, lex:word_match(lexer.FUNCTION_BUILTIN))
  local func = lex:tag(lexer.FUNCTION, lexer.word)
  local before_paren = #P('(')
  lex:add_rule('function', (builtin_func + func) * before_paren)

  -- Identifiers.
  lex:add_rule('identifier', lex:tag(lexer.IDENTIFIER, lexer.word))

  -- Fold points.
  lex:add_fold_point(lexer.OPERATOR, '{', '}')
  -- Word lists.
  local keywords = {
    'BEGIN', 'END', 'break', 'continue', 'do', 'else', 'for', 'if', 'in', 'while', --
    'delete', -- array
    'print', 'printf', 'getline', 'close', 'fflush', 'system', -- I/O
    'function', 'return', -- functions
    'next', 'nextfile', 'exit' -- program execution
  }
  lex:set_word_list(lexer.KEYWORD, keywords)

  local builtin_functions = {
    'gsub', 'index', 'length', 'match', 'split', 'sprintf', 'sub', 'substr', 'tolower', 'toupper', -- string
    'mktime', 'strftime', 'systime', -- time
    'atan2', 'cos', 'exp', 'int', 'log', 'rand', 'sin', 'sqrt', 'srand' -- arithmetic
  }
  lex:set_word_list(lexer.FUNCTION_BUILTIN, builtin_functions)

  local builtin_variables = {
    'ARGC', 'ARGV', 'CONVFMT', 'ENVIRON', 'FILENAME', 'FNR', 'FS', 'NF', 'NR', 'OFMT', 'OFS', 'ORS',
    'RLENGTH', 'RS', 'RSTART', 'SUBSEP'
  }
  lex:set_word_list(lexer.VARIABLE_BUILTIN, builtin_variables)

  -- Variables tagged with the '.gawk' suffix.
  local gawk_builtin_variables = {
    'ARGIND', 'BINMODE', 'ERRNO', 'FIELDWIDTHS', 'FPAT', 'FUNCTAB', 'IGNORECASE', 'LINT', 'PREC',
    'PROCINFO', 'ROUNDMODE', 'RT', 'SYMTAB', 'TEXTDOMAIN'
  }
  lex:set_word_list(lexer.VARIABLE_BUILTIN .. '.gawk', gawk_builtin_variables)

  -- Line-comment prefix, stored under the 'scintillua.comment' property.
  lexer.property['scintillua.comment'] = '#'

  return lex