logo

qmk_firmware

Custom branch of QMK firmware. Clone with: git clone https://anongit.hacktivis.me/git/qmk_firmware.git

autocorrect_data.py (13473B)


  1. # Copyright 2021 Google LLC
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # https://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """Python program to make autocorrect_data.h.
  15. This program reads from a prepared dictionary file and generates a C source file
  16. "autocorrect_data.h" with a serialized trie embedded as an array. Run this
  17. program and pass it as the first argument like:
  18. $ qmk generate-autocorrect-data autocorrect_dict.txt
  19. Each line of the dict file defines one typo and its correction with the syntax
  20. "typo -> correction". Blank lines or lines starting with '#' are ignored.
  21. Example:
  22. :thier -> their
  23. fitler -> filter
  24. lenght -> length
  25. ouput -> output
  26. widht -> width
  27. For full documentation, see QMK Docs
  28. """
  29. import textwrap
  30. from typing import Any, Dict, Iterator, List, Tuple
  31. from milc import cli
  32. from qmk.commands import dump_lines
  33. from qmk.constants import GPL2_HEADER_C_LIKE, GENERATED_HEADER_C_LIKE
  34. from qmk.keyboard import keyboard_completer, keyboard_folder
  35. from qmk.keymap import keymap_completer, locate_keymap
  36. from qmk.path import normpath
  37. from qmk.util import maybe_exit
  38. KC_A = 4
  39. KC_SPC = 0x2c
  40. KC_QUOT = 0x34
  41. TYPO_CHARS = dict([
  42. ("'", KC_QUOT),
  43. (':', KC_SPC), # "Word break" character.
  44. ] + [(chr(c), c + KC_A - ord('a')) for c in range(ord('a'),
  45. ord('z') + 1)]) # Characters a-z.
def parse_file(file_name: str) -> List[Tuple[str, str]]:
    """Parses autocorrections dictionary file.

    Each line of the file defines one typo and its correction with the syntax
    "typo -> correction". Blank lines or lines starting with '#' are ignored. The
    function validates that typos only have characters a-z and that typos are not
    substrings of other typos, otherwise the longer typo would never trigger.

    Args:
      file_name: String, path of the autocorrections dictionary.
    Returns:
      List of (typo, correction) tuples.
    """
    try:
        # Prefer the modern english_words API (package >= 2.0).
        import english_words
        correct_words = english_words.get_english_words_set(['web2'], lower=True, alpha=True)
    except AttributeError:
        # Older english_words package: fall back to its legacy word set and
        # suggest an upgrade.
        from english_words import english_words_lower_alpha_set as correct_words
        if not cli.args.quiet:
            cli.echo('The english_words package is outdated, update by running:')
            cli.echo(' {fg_cyan}python3 -m pip install english_words --upgrade')
    except ImportError:
        # No dictionary package installed: warn, then use a tiny built-in list
        # so the substring check below still runs in a best-effort way.
        if not cli.args.quiet:
            cli.echo('Autocorrection will falsely trigger when a typo is a substring of a correctly spelled word.')
            cli.echo('To check for this, install the english_words package and rerun this script:')
            cli.echo(' {fg_cyan}python3 -m pip install english_words')
        # Use a minimal word list as a fallback.
        correct_words = ('information', 'available', 'international', 'language', 'loosest', 'reference', 'wealthier', 'entertainment', 'association', 'provides', 'technology', 'statehood')

    autocorrections = []
    typos = set()
    for line_number, typo, correction in parse_file_lines(file_name):
        # Duplicates are skipped with a warning rather than treated as fatal.
        if typo in typos:
            cli.log.warning('{fg_red}Error:%d:{fg_reset} Ignoring duplicate typo: "{fg_cyan}%s{fg_reset}"', line_number, typo)
            continue

        # Check that `typo` is valid.
        if not (all([c in TYPO_CHARS for c in typo])):
            cli.log.error('{fg_red}Error:%d:{fg_reset} Typo "{fg_cyan}%s{fg_reset}" has characters other than a-z, \' and :.', line_number, typo)
            maybe_exit(1)
        # Typos that are substrings of each other shadow one another: the
        # longer one could never fire, so this is a hard error.
        for other_typo in typos:
            if typo in other_typo or other_typo in typo:
                cli.log.error('{fg_red}Error:%d:{fg_reset} Typos may not be substrings of one another, otherwise the longer typo would never trigger: "{fg_cyan}%s{fg_reset}" vs. "{fg_cyan}%s{fg_reset}".', line_number, typo, other_typo)
                maybe_exit(1)
        if len(typo) < 5:
            cli.log.warning('{fg_yellow}Warning:%d:{fg_reset} It is suggested that typos are at least 5 characters long to avoid false triggers: "{fg_cyan}%s{fg_reset}"', line_number, typo)
        if len(typo) > 127:
            # NOTE(review): 127-char limit presumably matches the firmware's
            # storage format — confirm against the C decoder before changing.
            cli.log.error('{fg_red}Error:%d:{fg_reset} Typo exceeds 127 chars: "{fg_cyan}%s{fg_reset}"', line_number, typo)
            maybe_exit(1)

        # Warn (non-fatal) when the typo would trigger on real dictionary words.
        check_typo_against_dictionary(typo, line_number, correct_words)

        autocorrections.append((typo, correction))
        typos.add(typo)
    return autocorrections
  95. def make_trie(autocorrections: List[Tuple[str, str]]) -> Dict[str, Any]:
  96. """Makes a trie from the the typos, writing in reverse.
  97. Args:
  98. autocorrections: List of (typo, correction) tuples.
  99. Returns:
  100. Dict of dict, representing the trie.
  101. """
  102. trie = {}
  103. for typo, correction in autocorrections:
  104. node = trie
  105. for letter in typo[::-1]:
  106. node = node.setdefault(letter, {})
  107. node['LEAF'] = (typo, correction)
  108. return trie
  109. def parse_file_lines(file_name: str) -> Iterator[Tuple[int, str, str]]:
  110. """Parses lines read from `file_name` into typo-correction pairs."""
  111. line_number = 0
  112. for line in open(file_name, 'rt'):
  113. line_number += 1
  114. line = line.strip()
  115. if line and line[0] != '#':
  116. # Parse syntax "typo -> correction", using strip to ignore indenting.
  117. tokens = [token.strip() for token in line.split('->', 1)]
  118. if len(tokens) != 2 or not tokens[0]:
  119. print(f'Error:{line_number}: Invalid syntax: "{line}"')
  120. maybe_exit(1)
  121. typo, correction = tokens
  122. typo = typo.lower() # Force typos to lowercase.
  123. typo = typo.replace(' ', ':')
  124. yield line_number, typo, correction
  125. def check_typo_against_dictionary(typo: str, line_number: int, correct_words) -> None:
  126. """Checks `typo` against English dictionary words."""
  127. if typo.startswith(':') and typo.endswith(':'):
  128. if typo[1:-1] in correct_words:
  129. cli.log.warning('{fg_yellow}Warning:%d:{fg_reset} Typo "{fg_cyan}%s{fg_reset}" is a correctly spelled dictionary word.', line_number, typo)
  130. elif typo.startswith(':') and not typo.endswith(':'):
  131. for word in correct_words:
  132. if word.startswith(typo[1:]):
  133. cli.log.warning('{fg_yellow}Warning:%d: {fg_reset}Typo "{fg_cyan}%s{fg_reset}" would falsely trigger on correctly spelled word "{fg_cyan}%s{fg_reset}".', line_number, typo, word)
  134. elif not typo.startswith(':') and typo.endswith(':'):
  135. for word in correct_words:
  136. if word.endswith(typo[:-1]):
  137. cli.log.warning('{fg_yellow}Warning:%d:{fg_reset} Typo "{fg_cyan}%s{fg_reset}" would falsely trigger on correctly spelled word "{fg_cyan}%s{fg_reset}".', line_number, typo, word)
  138. elif not typo.startswith(':') and not typo.endswith(':'):
  139. for word in correct_words:
  140. if typo in word:
  141. cli.log.warning('{fg_yellow}Warning:%d:{fg_reset} Typo "{fg_cyan}%s{fg_reset}" would falsely trigger on correctly spelled word "{fg_cyan}%s{fg_reset}".', line_number, typo, word)
def serialize_trie(autocorrections: List[Tuple[str, str]], trie: Dict[str, Any]) -> List[int]:
    """Serializes trie and correction data in a form readable by the C code.

    Args:
      autocorrections: List of (typo, correction) tuples.
      trie: Dict of dicts.
    Returns:
      List of ints in the range 0-255.
    """
    table = []

    # Traverse trie in depth first order.
    def traverse(trie_node):
        if 'LEAF' in trie_node:  # Handle a leaf trie node.
            typo, correction = trie_node['LEAF']
            word_boundary_ending = typo[-1] == ':'
            typo = typo.strip(':')
            i = 0  # Make the autocorrection data for this entry and serialize it.
            # Skip the longest common prefix of typo and correction; only the
            # differing tail needs backspacing and retyping.
            while i < min(len(typo), len(correction)) and typo[i] == correction[i]:
                i += 1
            backspaces = len(typo) - i - 1 + word_boundary_ending
            assert 0 <= backspaces <= 63
            correction = correction[i:]
            # Leaf payload: one count byte (high bit set, low bits = number of
            # backspaces), the ASCII replacement text, then a NUL terminator.
            bs_count = [backspaces + 128]
            data = bs_count + list(bytes(correction, 'ascii')) + [0]

            entry = {'data': data, 'links': [], 'byte_offset': 0}
            table.append(entry)
        elif len(trie_node) == 1:  # Handle trie node with a single child.
            c, trie_node = next(iter(trie_node.items()))
            entry = {'chars': c, 'byte_offset': 0}

            # It's common for a trie to have long chains of single-child nodes. We
            # find the whole chain so that we can serialize it more efficiently.
            while len(trie_node) == 1 and 'LEAF' not in trie_node:
                c, trie_node = next(iter(trie_node.items()))
                entry['chars'] += c

            table.append(entry)
            entry['links'] = [traverse(trie_node)]
        else:  # Handle trie node with multiple children.
            # Children are visited in sorted character order so output is
            # deterministic for a given dictionary.
            entry = {'chars': ''.join(sorted(trie_node.keys())), 'byte_offset': 0}
            table.append(entry)
            entry['links'] = [traverse(trie_node[c]) for c in entry['chars']]
        return entry

    traverse(trie)

    # Serializes a single table entry to bytes.
    def serialize(e: Dict[str, Any]) -> List[int]:
        if not e['links']:  # Handle a leaf table entry.
            return e['data']
        elif len(e['links']) == 1:  # Handle a chain table entry.
            # NOTE(review): no link bytes are emitted for a chain — this
            # appears to rely on the chain's child being serialized
            # immediately after it in the table; confirm against the C decoder.
            return [TYPO_CHARS[c] for c in e['chars']] + [0]  # + encode_link(e['links'][0]))
        else:  # Handle a branch table entry.
            data = []
            # First branch byte carries flag 64; each option is followed by a
            # two-byte link to its child entry.
            for c, link in zip(e['chars'], e['links']):
                data += [TYPO_CHARS[c] | (0 if data else 64)] + encode_link(link)
            return data + [0]

    byte_offset = 0
    for e in table:  # To encode links, first compute byte offset of each entry.
        e['byte_offset'] = byte_offset
        byte_offset += len(serialize(e))
    assert 0 <= byte_offset <= 0xffff

    return [b for e in table for b in serialize(e)]  # Serialize final table.
  199. def encode_link(link: Dict[str, Any]) -> List[int]:
  200. """Encodes a node link as two bytes."""
  201. byte_offset = link['byte_offset']
  202. if not (0 <= byte_offset <= 0xffff):
  203. cli.log.error('{fg_red}Error:{fg_reset} The autocorrection table is too large, a node link exceeds 64KB limit. Try reducing the autocorrection dict to fewer entries.')
  204. maybe_exit(1)
  205. return [byte_offset & 255, byte_offset >> 8]
  206. def typo_len(e: Tuple[str, str]) -> int:
  207. return len(e[0])
  208. def to_hex(b: int) -> str:
  209. return f'0x{b:02X}'
@cli.argument('filename', type=normpath, help='The autocorrection database file')
@cli.argument('-kb', '--keyboard', type=keyboard_folder, completer=keyboard_completer, help='The keyboard to build a firmware for. Ignored when a configurator export is supplied.')
@cli.argument('-km', '--keymap', completer=keymap_completer, help='The keymap to build a firmware for. Ignored when a configurator export is supplied.')
@cli.argument('-o', '--output', arg_only=True, type=normpath, help='File to write to')
@cli.argument('-q', '--quiet', arg_only=True, action='store_true', help="Quiet mode, only output error messages")
@cli.subcommand('Generate the autocorrection data file from a dictionary file.')
def generate_autocorrect_data(cli):
    """Subcommand entry point: parses the dictionary, serializes the trie, and
    writes the generated autocorrect_data.h header."""
    autocorrections = parse_file(cli.args.filename)
    trie = make_trie(autocorrections)
    data = serialize_trie(autocorrections, trie)

    # Resolve keyboard/keymap from CLI args, then user config, then subcommand
    # config; when both resolve, the output path is redirected into that
    # keymap's folder, overriding -o.
    current_keyboard = cli.args.keyboard or cli.config.user.keyboard or cli.config.generate_autocorrect_data.keyboard
    current_keymap = cli.args.keymap or cli.config.user.keymap or cli.config.generate_autocorrect_data.keymap

    if current_keyboard and current_keymap:
        cli.args.output = locate_keymap(current_keyboard, current_keymap).parent / 'autocorrect_data.h'

    # Sanity check: the serialized trie must be pure byte values.
    assert all(0 <= b <= 255 for b in data)

    min_typo = min(autocorrections, key=typo_len)[0]
    max_typo = max(autocorrections, key=typo_len)[0]

    # Build the autocorrect_data.h file.
    autocorrect_data_h_lines = [GPL2_HEADER_C_LIKE, GENERATED_HEADER_C_LIKE, '#pragma once', '']

    # Human-readable dictionary listing; typos padded so corrections align.
    autocorrect_data_h_lines.append(f'// Autocorrection dictionary ({len(autocorrections)} entries):')
    for typo, correction in autocorrections:
        autocorrect_data_h_lines.append(f'// {typo:<{len(max_typo)}} -> {correction}')

    autocorrect_data_h_lines.append('')
    autocorrect_data_h_lines.append(f'#define AUTOCORRECT_MIN_LENGTH {len(min_typo)} // "{min_typo}"')
    autocorrect_data_h_lines.append(f'#define AUTOCORRECT_MAX_LENGTH {len(max_typo)} // "{max_typo}"')
    autocorrect_data_h_lines.append(f'#define DICTIONARY_SIZE {len(data)}')
    autocorrect_data_h_lines.append('')
    autocorrect_data_h_lines.append('static const uint8_t autocorrect_data[DICTIONARY_SIZE] PROGMEM = {')
    # Wrap the byte array at 100 columns for readability of the generated file.
    autocorrect_data_h_lines.append(textwrap.fill(' %s' % (', '.join(map(to_hex, data))), width=100, subsequent_indent=' '))
    autocorrect_data_h_lines.append('};')

    # Show the results
    dump_lines(cli.args.output, autocorrect_data_h_lines, cli.args.quiet)