logo

auto_linker

AutoLinker-shim, based on https://git.pleroma.social/pleroma/auto_linker git clone https://hacktivis.me/git/auto_linker.git

parser.ex (10341B)


  1. defmodule Linkify.Parser do
  2. @moduledoc """
  3. Module to handle parsing the input string.
  4. """
  5. alias Linkify.Builder
  # Matches a hashtag anchored at the start of the buffer: a "#" followed by word
  # characters, at least one of which must be a letter, "_" or "·". Combining marks
  # (\p{M}) are accepted after that character.
  @match_hashtag ~r/^(?<tag>\#[[:word:]_]*[[:alpha:]_·][[:word:]_·\p{M}]*)/u

  # Captures the tag name (a, code or pre) at the start of the text that follows a "<".
  @match_skipped_tag ~r/^(?<tag>(a|code|pre)).*>*/

  # Trailing punctuation removed from a buffer by strip_punctuation/1.
  @delimiters ~r/[,.;:>?!]*$/

  # Defaults that caller-supplied options are merged over in parse/2.
  @default_opts %{url: true, validate_tld: true}
  @doc """
  Scans `input` for linkable items and replaces each match with a link.

  `input` is either a binary or an `{input, user_acc}` tuple. For a binary only the
  resulting output is returned; for a tuple the result is `{output, user_acc}`.
  `opts` may be a map or a keyword list and is merged over the module defaults
  (`url: true, validate_tld: true`). When `opts[:iodata]` is truthy the output is
  returned as iodata instead of being flattened into a binary.

  ## Examples

      iex> Linkify.Parser.parse("Check out http://google.com")
      ~s{Check out <a href="http://google.com">http://google.com</a>}

  """
  @types [:url, :hashtag, :mention, :email]

  def parse(input, opts \\ %{})

  def parse(input, opts) when is_binary(input) do
    {output, _user_acc} = parse({input, %{}}, opts)
    output
  end

  def parse(input, opts) when is_list(opts) do
    parse(input, Enum.into(opts, %{}))
  end

  def parse(input, opts) do
    merged = Map.merge(@default_opts, opts)
    {iodata, user_acc} = do_parse(input, merged, {"", [], :parsing})
    output = if merged[:iodata], do: iodata, else: IO.iodata_to_binary(iodata)
    {output, user_acc}
  end
  # Pushes `buffer` onto the output accumulator, which is kept in reverse order.
  defp accumulate(acc, buffer) do
    [buffer | acc]
  end

  # Pushes `buffer` and then `trailing`, so that `trailing` follows `buffer` once the
  # accumulator is reversed.
  defp accumulate(acc, buffer, trailing) do
    [trailing | accumulate(acc, buffer)]
  end
  # Recursive scanner behind parse/2.
  #
  # Arguments: `{remaining_text, user_acc}`, the options map, and
  # `{buffer, acc, state}` where `buffer` is the token being collected, `acc` is the
  # output so far in reverse order and `state` is one of:
  #
  #   * `:parsing`        - regular text
  #   * `:skip`           - entered after "<a", "<pre" or "<code"; left at "</a>",
  #                         "</pre>" or "</code>"
  #   * `{:open, level}`  - a "<" (or "<br") was just consumed
  #   * `{:attrs, level}` - inside a tag; bytes are copied as-is until ">"
  #
  # Returns `{iodata, user_acc}`. Clause order is significant: more specific prefixes
  # must be tried before the generic byte-by-byte clauses at the bottom.

  # End of input with nothing pending: emit the accumulated output in order.
  defp do_parse({"", user_acc}, _opts, {"", acc, _}),
    do: {Enum.reverse(acc), user_acc}

  # A hashtag immediately followed by a tag, e.g. "#tag<br>".
  defp do_parse(
         {"<" <> text, user_acc},
         %{hashtag: true} = opts,
         {"#" <> _ = buffer, acc, :parsing}
       ) do
    {buffer, user_acc} = link(buffer, opts, user_acc)

    case Regex.run(@match_skipped_tag, text, capture: [:tag]) do
      [tag] ->
        # The tag name is re-emitted exactly once below, so remove exactly one
        # occurrence from the remaining text (trim_leading would strip repeats).
        text = String.replace_prefix(text, tag, "")
        do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "<#{tag}"), :skip})

      nil ->
        # Keep the linked hashtag in the output before the tag is opened; passing
        # `acc` through untouched would silently drop it.
        do_parse({text, user_acc}, opts, {"<", accumulate(acc, buffer), {:open, 1}})
    end
  end

  # "<br" terminates the current token, which is linked before the tag is emitted.
  defp do_parse({"<br" <> text, user_acc}, opts, {buffer, acc, :parsing}) do
    {buffer, user_acc} = link(buffer, opts, user_acc)
    do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "<br"), {:open, 1}})
  end

  # Opening a, pre and code tags switch to :skip; the pending buffer is emitted unlinked.
  defp do_parse({"<a" <> text, user_acc}, opts, {buffer, acc, :parsing}),
    do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "<a"), :skip})

  defp do_parse({"<pre" <> text, user_acc}, opts, {buffer, acc, :parsing}),
    do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "<pre"), :skip})

  defp do_parse({"<code" <> text, user_acc}, opts, {buffer, acc, :parsing}),
    do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "<code"), :skip})

  # Closing a, pre and code tags return from :skip to :parsing.
  defp do_parse({"</a>" <> text, user_acc}, opts, {buffer, acc, :skip}),
    do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "</a>"), :parsing})

  defp do_parse({"</pre>" <> text, user_acc}, opts, {buffer, acc, :skip}),
    do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "</pre>"), :parsing})

  defp do_parse({"</code>" <> text, user_acc}, opts, {buffer, acc, :skip}),
    do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, "</code>"), :parsing})

  # Any other "<" seen with an empty buffer starts a tag.
  defp do_parse({"<" <> text, user_acc}, opts, {"", acc, :parsing}),
    do: do_parse({text, user_acc}, opts, {"<", acc, {:open, 1}})

  # ">" ends the tag and resumes normal parsing.
  defp do_parse({">" <> text, user_acc}, opts, {buffer, acc, {:attrs, _level}}),
    do: do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer, ">"), :parsing})

  # Inside a tag every byte is copied straight to the output.
  defp do_parse({<<ch::8>> <> text, user_acc}, opts, {"", acc, {:attrs, level}}) do
    do_parse({text, user_acc}, opts, {"", accumulate(acc, <<ch::8>>), {:attrs, level}})
  end

  # Flush what was buffered when the tag opened and move on to its attributes.
  defp do_parse({text, user_acc}, opts, {buffer, acc, {:open, level}}) do
    do_parse({text, user_acc}, opts, {"", accumulate(acc, buffer), {:attrs, level}})
  end

  # Whitespace ends the current token: link it and emit the whitespace byte after it.
  defp do_parse(
         {<<char::bytes-size(1), text::binary>>, user_acc},
         opts,
         {buffer, acc, state}
       )
       when char in [" ", "\r", "\n"] do
    {buffer, user_acc} = link(buffer, opts, user_acc)

    do_parse(
      {text, user_acc},
      opts,
      {"", accumulate(acc, buffer, char), state}
    )
  end

  # Last byte of the input: append it to the token and link the result.
  defp do_parse({<<ch::8>>, user_acc}, opts, {buffer, acc, state}) do
    {buffer, user_acc} = link(buffer <> <<ch::8>>, opts, user_acc)

    do_parse(
      {"", user_acc},
      opts,
      {"", accumulate(acc, buffer), state}
    )
  end

  # Any other byte extends the current token.
  defp do_parse({<<ch::8>> <> text, user_acc}, opts, {buffer, acc, state}),
    do: do_parse({text, user_acc}, opts, {buffer <> <<ch::8>>, acc, state})
  # Tries to turn `buffer` into a link of the given type (`:url`, `:email`,
  # `:mention` or `:hashtag`). Returns `:nomatch` when the buffer does not qualify;
  # otherwise returns whatever the corresponding link_* function produces.
  def check_and_link(:url, buffer, opts, _user_acc),
    do: if(url?(buffer, opts), do: link_url(buffer, opts), else: :nomatch)

  def check_and_link(:email, buffer, opts, _user_acc) do
    if email?(buffer, opts) do
      link_email(buffer, opts)
    else
      :nomatch
    end
  end

  def check_and_link(:mention, buffer, opts, user_acc) do
    mention = match_mention(buffer)
    link_mention(mention, buffer, opts, user_acc)
  end

  def check_and_link(:hashtag, buffer, opts, user_acc) do
    hashtag = match_hashtag(buffer)
    link_hashtag(hashtag, buffer, opts, user_acc)
  end
  # Removes every leading "(" and every trailing ")" from `buffer`.
  defp strip_parens(buffer) do
    without_opening = String.trim_leading(buffer, "(")
    String.trim_trailing(without_opening, ")")
  end
  # Drops the trailing run of delimiter characters (see @delimiters) from `buffer`.
  defp strip_punctuation(buffer) do
    String.replace(buffer, @delimiters, "")
  end
  # Returns a truthy value when `buffer` is a well-formed URL that starts with one
  # of the accepted prefixes ("http://", "https://", "mailto:" plus any listed in
  # `opts[:extra_prefixes]`) and passes the TLD check.
  def url?(buffer, opts) do
    extra_prefixes = opts[:extra_prefixes] || []
    prefixes = ["http://", "https://", "mailto:"] ++ extra_prefixes

    valid_url?(buffer) &&
      String.starts_with?(buffer, prefixes) &&
      valid_tld?(buffer, opts)
  end
  # Returns a truthy value when `buffer` has the form `user@host` and the host part
  # (everything after the last "@") is a valid hostname that passes the TLD check.
  def email?(buffer, opts) do
    # Note: In reality the local part can only be checked by the remote server
    captures = Regex.run(~r/^(?<user>.*)@(?<host>[^@]+)$/, buffer, capture: [:user, :host])

    with [_user, hostname] <- captures do
      valid_hostname?(hostname) && valid_tld?(hostname, opts)
    else
      _ -> false
    end
  end
  # Checks that `url` splits at its first ":" into a scheme and a remainder, that
  # the scheme begins with characters from [0-9a-z+-.], and that every codepoint
  # of the remainder is either non-ASCII (>= 0x80), an ASCII letter or digit, or
  # one of -._~!$&'()*+,;=:@?/#%.
  defp valid_url?(url) do
    # ~r/^[0-9a-z+\-\.]+:[0-9a-z\-\._~!$&'()*+,;=:@?\/#%]+$/ui
    allowed? = fn c ->
      c >= 0x80 || c in ?0..?9 || c in ?A..?Z || c in ?a..?z ||
        c in ~c"-._~!$&'()*+,;=:@?/#%"
    end

    case String.split(url, ":", parts: 2) do
      [scheme, rest] ->
        scheme_ok? = Regex.match?(~r/^[0-9a-z+\-\.]+/, scheme)
        rest_ok? = rest |> String.to_charlist() |> Enum.all?(allowed?)
        scheme_ok? && rest_ok?

      _ ->
        false
    end
  end
  @doc """
  Validates a URL's TLD. Returns a boolean.

  This implementation ignores both arguments and always returns `true`.
  """
  def valid_tld?(_url, _opts) do
    true
  end
  # Converts `string` to an integer in the given `base` (10 by default). Returns
  # `nil` instead of raising when the conversion fails.
  def safe_to_integer(string, base \\ 10) do
    try do
      String.to_integer(string, base)
    rescue
      _error -> nil
    end
  end
  @doc """
  Returns `true` when `buffer` looks like an IP address literal, `false` otherwise.

  Two shapes are recognised:

    * four "."-separated parts, each a decimal integer in `0..255`;
    * after removing surrounding "[" / "]", between one and eight ":"-separated
      non-empty groups, each a hexadecimal integer in `0..0xFFFF`.

  ## Examples

      ip?("127.0.0.1") #=> true
      ip?("256.0.0.1") #=> false
      ip?("[::1]")     #=> true
      ip?("1.2.3")     #=> false
      ip?("")          #=> false

  """
  def ip?(buffer) do
    v4 = String.split(buffer, ".")

    v6 =
      buffer
      |> String.trim_leading("[")
      |> String.trim_trailing("]")
      |> String.split(":", trim: true)

    cond do
      length(v4) == 4 ->
        # safe_to_integer/2 yields nil for non-numeric parts, and nil is never in
        # the range, so malformed parts are rejected.
        Enum.all?(v4, fn part -> safe_to_integer(part, 10) in 0..255 end)

      length(v6) in 1..8 ->
        # Validate the IPv6 groups themselves (not the "."-split parts).
        Enum.all?(v6, fn group -> safe_to_integer(group, 16) in 0..0xFFFF end)

      # Catch-all: anything else is not an IP (a `false ->` clause never matches
      # and would make `cond` raise CondClauseError).
      true ->
        false
    end
  end
  # IDN-compatible, ported from musl-libc's is_valid_hostname()
  #
  # A hostname is accepted when every codepoint is non-ASCII (>= 0x80), an ASCII
  # letter or digit, "." or "-". An empty hostname is accepted as well.
  def valid_hostname?(hostname) do
    allowed? = fn c ->
      c >= 0x80 || c in ?0..?9 || c in ?A..?Z || c in ?a..?z || c in ~c".-"
    end

    hostname
    |> String.to_charlist()
    |> Enum.all?(allowed?)
  end
  # Returns the mention contained in `buffer` ("@user" or "@user@host"), or `nil`
  # when the whole buffer is not a mention or its host part fails validation.
  def match_mention(buffer) do
    captures =
      Regex.run(~r/^@(?<user>[a-zA-Z\d_-]+)(@(?<host>[^@]+))?$/, buffer,
        capture: [:user, :host]
      )

    case captures do
      # No host part: a local mention.
      [user, ""] ->
        "@" <> user

      [user, hostname] ->
        if valid_hostname?(hostname) && valid_tld?(hostname, []) do
          "@" <> user <> "@" <> hostname
        else
          nil
        end

      _ ->
        nil
    end
  end
  # Returns the hashtag at the start of `buffer` (as captured by @match_hashtag),
  # or `nil` when there is none.
  def match_hashtag(buffer) do
    with [hashtag] <- Regex.run(@match_hashtag, buffer, capture: [:tag]) do
      hashtag
    else
      _ -> nil
    end
  end
  # Builds the link for a matched hashtag. `nil` (no match) yields `:nomatch`.
  # When `opts` is a map with a `:hashtag_handler` function, that function is
  # called with `(hashtag, buffer, opts, user_acc)`; otherwise the link comes from
  # Builder.create_hashtag_link/3. Either result goes through maybe_update_buffer/3.
  def link_hashtag(nil, _buffer, _, _user_acc), do: :nomatch

  def link_hashtag(hashtag, buffer, %{hashtag_handler: hashtag_handler} = opts, user_acc) do
    handled = hashtag_handler.(hashtag, buffer, opts, user_acc)
    maybe_update_buffer(handled, hashtag, buffer)
  end

  def link_hashtag(hashtag, buffer, opts, _user_acc) do
    created = Builder.create_hashtag_link(hashtag, buffer, opts)
    maybe_update_buffer(created, hashtag, buffer)
  end
  # Builds the link for a matched mention. `nil` (no match) yields `:nomatch`.
  # When `opts` is a map with a `:mention_handler` function, that function is
  # called with `(mention, buffer, opts, user_acc)`; otherwise the link comes from
  # Builder.create_mention_link/3. Either result goes through maybe_update_buffer/3.
  def link_mention(nil, _buffer, _, _user_acc), do: :nomatch

  def link_mention(mention, buffer, %{mention_handler: mention_handler} = opts, user_acc) do
    handled = mention_handler.(mention, buffer, opts, user_acc)
    maybe_update_buffer(handled, mention, buffer)
  end

  def link_mention(mention, buffer, opts, _user_acc) do
    created = Builder.create_mention_link(mention, buffer, opts)
    maybe_update_buffer(created, mention, buffer)
  end
  # Post-processes a handler/builder result for a `match` found inside `buffer`.
  #
  # A bare binary result is first wrapped as `{binary, nil}`. If the result is then
  # an `{out, user_acc}` tuple and both `match` and `out` differ from `buffer`, the
  # occurrences of `match` in `buffer` are replaced by `out`, so the text around the
  # match is kept. Any other result is returned unchanged.
  #
  # NOTE(review): a binary result always comes back paired with a `nil` user_acc;
  # confirm callers that destructure `{link, user_acc}` expect that.
  defp maybe_update_buffer(result, match, buffer) do
    normalized = if is_binary(result), do: {result, nil}, else: result

    case normalized do
      {out, user_acc} when match != buffer and out != buffer ->
        {String.replace(buffer, match, out), user_acc}

      other ->
        other
    end
  end
  # Thin wrapper: forwards `(buffer, opts)` to Builder.create_link/2.
  @doc false
  defdelegate link_url(buffer, opts), to: Builder, as: :create_link
  # Thin wrapper: forwards `(buffer, opts)` to Builder.create_email_link/2.
  @doc false
  defdelegate link_email(buffer, opts), to: Builder, as: :create_email_link
  # Thin wrapper: forwards `(buffer, opts)` to Builder.create_extra_link/2.
  defdelegate link_extra(buffer, opts), to: Builder, as: :create_extra_link
  # Tries each link type from @types, in order, for which `opts[type] == true` and
  # returns the first successful result as `{linked_buffer, user_acc}`. When no
  # type matches, `{buffer, user_acc}` is returned unchanged.
  defp link(buffer, opts, user_acc) do
    @types
    |> Enum.filter(fn type -> opts[type] == true end)
    |> Enum.find_value({buffer, user_acc}, fn type ->
      case check_and_link_reducer(type, buffer, opts, user_acc) do
        {:halt, linked} -> linked
        {:cont, _unchanged} -> nil
      end
    end)
  end
  # One step of link/3 for a single link type.
  #
  # The candidate is the part of `buffer` before the first "<", with trailing
  # punctuation and surrounding parentheses removed. Returns `{:cont, {buffer,
  # user_acc}}` when the candidate is not of this type, otherwise `{:halt,
  # {output, user_acc}}` with the stripped characters restored around the link.
  defp check_and_link_reducer(type, buffer, opts, user_acc) do
    [before_tag | _rest] = String.split(buffer, "<")
    candidate = before_tag |> strip_punctuation() |> strip_parens()

    case check_and_link(type, candidate, opts, user_acc) do
      :nomatch ->
        {:cont, {buffer, user_acc}}

      # The linker returned its own accumulator: pass that one on.
      {link, new_user_acc} ->
        {:halt, {restore_stripped_symbols(buffer, candidate, link), new_user_acc}}

      link ->
        {:halt, {restore_stripped_symbols(buffer, candidate, link), user_acc}}
    end
  end
  # Puts back the characters that were stripped from `buffer` before linking.
  # When nothing was stripped the link is returned as-is; otherwise the result is
  # a list in which `link` stands in for each occurrence of `stripped_buffer`.
  defp restore_stripped_symbols(buffer, stripped_buffer, link) do
    if buffer === stripped_buffer do
      link
    else
      pieces = String.split(buffer, stripped_buffer)
      Enum.intersperse(pieces, link)
    end
  end
  285. end