logo

pleroma

My custom branche(s) on git.pleroma.social/pleroma/pleroma git clone https://hacktivis.me/git/pleroma.git

parser.ex (5705B)


  1. # Pleroma: A lightweight social networking server
  2. # Copyright © 2017-2022 Pleroma Authors <https://pleroma.social/>
  3. # SPDX-License-Identifier: AGPL-3.0-only
  4. defmodule Pleroma.Web.RichMedia.Parser do
  5. require Logger
  6. @cachex Pleroma.Config.get([:cachex, :provider], Cachex)
  7. @config_impl Application.compile_env(:pleroma, [__MODULE__, :config_impl], Pleroma.Config)
  8. defp parsers do
  9. Pleroma.Config.get([:rich_media, :parsers])
  10. end
  11. def parse(nil), do: {:error, "No URL provided"}
  12. @spec parse(String.t()) :: {:ok, map()} | {:error, any()}
  13. def parse(url) do
  14. with :ok <- validate_page_url(url),
  15. {:ok, data} <- get_cached_or_parse(url),
  16. {:ok, _} <- set_ttl_based_on_image(data, url) do
  17. {:ok, data}
  18. end
  19. end
  20. defp get_cached_or_parse(url) do
  21. case @cachex.fetch(:rich_media_cache, url, fn ->
  22. case parse_url(url) do
  23. {:ok, _} = res ->
  24. {:commit, res}
  25. {:error, reason} = e ->
  26. # Unfortunately we have to log errors here, instead of doing that
  27. # along with ttl setting at the bottom. Otherwise we can get log spam
  28. # if more than one process was waiting for the rich media card
  29. # while it was generated. Ideally we would set ttl here as well,
  30. # so we don't override it number_of_waiters_on_generation
  31. # times, but one, obviously, can't set ttl for not-yet-created entry
  32. # and Cachex doesn't support returning ttl from the fetch callback.
  33. log_error(url, reason)
  34. {:commit, e}
  35. end
  36. end) do
  37. {action, res} when action in [:commit, :ok] ->
  38. case res do
  39. {:ok, _data} = res ->
  40. res
  41. {:error, reason} = e ->
  42. if action == :commit, do: set_error_ttl(url, reason)
  43. e
  44. end
  45. {:error, e} ->
  46. {:error, {:cachex_error, e}}
  47. end
  48. end
  49. defp set_error_ttl(_url, :body_too_large), do: :ok
  50. defp set_error_ttl(_url, {:content_type, _}), do: :ok
  51. # The TTL is not set for the errors above, since they are unlikely to change
  52. # with time
  53. defp set_error_ttl(url, _reason) do
  54. ttl = Pleroma.Config.get([:rich_media, :failure_backoff], 60_000)
  55. @cachex.expire(:rich_media_cache, url, ttl)
  56. :ok
  57. end
  58. defp log_error(url, {:invalid_metadata, data}) do
  59. Logger.debug(fn -> "Incomplete or invalid metadata for #{url}: #{inspect(data)}" end)
  60. end
  61. defp log_error(url, reason) do
  62. Logger.warning(fn -> "Rich media error for #{url}: #{inspect(reason)}" end)
  63. end
  64. @doc """
  65. Set the rich media cache based on the expiration time of image.
  66. Adopt behaviour `Pleroma.Web.RichMedia.Parser.TTL`
  67. ## Example
  68. defmodule MyModule do
  69. @behaviour Pleroma.Web.RichMedia.Parser.TTL
  70. def ttl(data, url) do
  71. image_url = Map.get(data, :image)
  72. # do some parsing in the url and get the ttl of the image
  73. # and return ttl is unix time
  74. parse_ttl_from_url(image_url)
  75. end
  76. end
  77. Define the module in the config
  78. config :pleroma, :rich_media,
  79. ttl_setters: [MyModule]
  80. """
  81. @spec set_ttl_based_on_image(map(), String.t()) ::
  82. {:ok, integer() | :noop} | {:error, :no_key}
  83. def set_ttl_based_on_image(data, url) do
  84. case get_ttl_from_image(data, url) do
  85. ttl when is_number(ttl) ->
  86. ttl = ttl * 1000
  87. case @cachex.expire_at(:rich_media_cache, url, ttl) do
  88. {:ok, true} -> {:ok, ttl}
  89. {:ok, false} -> {:error, :no_key}
  90. end
  91. _ ->
  92. {:ok, :noop}
  93. end
  94. end
  95. defp get_ttl_from_image(data, url) do
  96. [:rich_media, :ttl_setters]
  97. |> Pleroma.Config.get()
  98. |> Enum.reduce({:ok, nil}, fn
  99. module, {:ok, _ttl} ->
  100. module.ttl(data, url)
  101. _, error ->
  102. error
  103. end)
  104. end
  105. def parse_url(url) do
  106. with {:ok, %Tesla.Env{body: html}} <- Pleroma.Web.RichMedia.Helpers.rich_media_get(url),
  107. {:ok, html} <- Floki.parse_document(html) do
  108. html
  109. |> maybe_parse()
  110. |> Map.put("url", url)
  111. |> clean_parsed_data()
  112. |> check_parsed_data()
  113. end
  114. end
  115. defp maybe_parse(html) do
  116. Enum.reduce_while(parsers(), %{}, fn parser, acc ->
  117. case parser.parse(html, acc) do
  118. data when data != %{} -> {:halt, data}
  119. _ -> {:cont, acc}
  120. end
  121. end)
  122. end
  123. defp check_parsed_data(%{"title" => title} = data)
  124. when is_binary(title) and title != "" do
  125. {:ok, data}
  126. end
  127. defp check_parsed_data(data) do
  128. {:error, {:invalid_metadata, data}}
  129. end
  130. defp clean_parsed_data(data) do
  131. data
  132. |> Enum.reject(fn {key, val} ->
  133. not match?({:ok, _}, Jason.encode(%{key => val}))
  134. end)
  135. |> Map.new()
  136. end
  137. @spec validate_page_url(URI.t() | binary()) :: :ok | :error
  138. defp validate_page_url(page_url) when is_binary(page_url) do
  139. validate_tld = @config_impl.get([Pleroma.Formatter, :validate_tld])
  140. page_url
  141. |> Linkify.Parser.url?(scheme: true, validate_tld: validate_tld)
  142. |> parse_uri(page_url)
  143. end
  144. defp validate_page_url(%URI{host: host, scheme: "https"}) do
  145. cond do
  146. Linkify.Parser.ip?(host) ->
  147. :error
  148. host in @config_impl.get([:rich_media, :ignore_hosts], []) ->
  149. :error
  150. get_tld(host) in @config_impl.get([:rich_media, :ignore_tld], []) ->
  151. :error
  152. true ->
  153. :ok
  154. end
  155. end
  156. defp validate_page_url(_), do: :error
  157. defp parse_uri(true, url) do
  158. url
  159. |> URI.parse()
  160. |> validate_page_url
  161. end
  162. defp parse_uri(_, _), do: :error
  163. defp get_tld(host) do
  164. host
  165. |> String.split(".")
  166. |> Enum.reverse()
  167. |> hd
  168. end
  169. end