logo

mastofe

My custom branche(s) on git.pleroma.social/pleroma/mastofe
commit: 0717d9b3e6904a4dcd5d2dc9e680cc5b21c50e51
parent: 6e4046fc3f3973ba0b6994930a8b58726e507003
Author: Eugen Rochko <eugen@zeonfederated.com>
Date:   Sun,  8 Oct 2017 17:34:34 +0200

Set snowflake IDs for backdated statuses (#5260)

- Rename Mastodon::TimestampIds into Mastodon::Snowflake for clarity
- Skip for statuses coming from inbox, aka delivered in real-time
- Skip for statuses that claim to be from the future

Diffstat:

Mapp/lib/activitypub/activity.rb7++++---
Mapp/lib/activitypub/activity/announce.rb3++-
Mapp/lib/activitypub/activity/create.rb2+-
Mapp/lib/ostatus/activity/base.rb5+++--
Mapp/lib/ostatus/activity/creation.rb2+-
Mapp/lib/ostatus/activity/general.rb2+-
Mapp/models/status.rb2++
Mapp/services/activitypub/process_collection_service.rb5+++--
Mapp/services/process_feed_service.rb6++++--
Mapp/workers/activitypub/processing_worker.rb2+-
Mapp/workers/processing_worker.rb2+-
Mconfig/application.rb1+
Mconfig/brakeman.ignore44++++++++++++++++++++++----------------------
Alib/mastodon/snowflake.rb162+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Dlib/mastodon/timestamp_ids.rb131-------------------------------------------------------------------------------
Mlib/tasks/db.rake6+++---
Mspec/services/activitypub/process_collection_service_spec.rb4++--
17 files changed, 213 insertions(+), 173 deletions(-)

diff --git a/app/lib/activitypub/activity.rb b/app/lib/activitypub/activity.rb @@ -3,10 +3,11 @@ class ActivityPub::Activity include JsonLdHelper - def initialize(json, account) + def initialize(json, account, options = {}) @json = json @account = account @object = @json['object'] + @options = options end def perform @@ -14,9 +15,9 @@ class ActivityPub::Activity end class << self - def factory(json, account) + def factory(json, account, options = {}) @json = json - klass&.new(json, account) + klass&.new(json, account, options) end private diff --git a/app/lib/activitypub/activity/announce.rb b/app/lib/activitypub/activity/announce.rb @@ -15,8 +15,9 @@ class ActivityPub::Activity::Announce < ActivityPub::Activity account: @account, reblog: original_status, uri: @json['id'], - created_at: @json['published'] || Time.now.utc + created_at: @options[:override_timestamps] ? nil : @json['published'] ) + distribute(status) status end diff --git a/app/lib/activitypub/activity/create.rb b/app/lib/activitypub/activity/create.rb @@ -43,7 +43,7 @@ class ActivityPub::Activity::Create < ActivityPub::Activity text: text_from_content || '', language: language_from_content, spoiler_text: @object['summary'] || '', - created_at: @object['published'] || Time.now.utc, + created_at: @options[:override_timestamps] ? nil : @object['published'], reply: @object['inReplyTo'].present?, sensitive: @object['sensitive'] || false, visibility: visibility_from_audience, diff --git a/app/lib/ostatus/activity/base.rb b/app/lib/ostatus/activity/base.rb @@ -1,9 +1,10 @@ # frozen_string_literal: true class OStatus::Activity::Base - def initialize(xml, account = nil) - @xml = xml + def initialize(xml, account = nil, options = {}) + @xml = xml @account = account + @options = options end def status? diff --git a/app/lib/ostatus/activity/creation.rb b/app/lib/ostatus/activity/creation.rb @@ -34,7 +34,7 @@ class OStatus::Activity::Creation < OStatus::Activity::Base reblog: cached_reblog, text: content, spoiler_text: content_warning, - created_at: published, + created_at: @options[:override_timestamps] ? nil : published, reply: thread?, language: content_language, visibility: visibility_scope, diff --git a/app/lib/ostatus/activity/general.rb b/app/lib/ostatus/activity/general.rb @@ -2,7 +2,7 @@ class OStatus::Activity::General < OStatus::Activity::Base def specialize - special_class&.new(@xml, @account) + special_class&.new(@xml, @account, @options) end private diff --git a/app/models/status.rb b/app/models/status.rb @@ -136,6 +136,8 @@ class Status < ApplicationRecord after_create :store_uri, if: :local? + around_create Mastodon::Snowflake::Callbacks + before_validation :prepare_contents, if: :local? before_validation :set_reblog before_validation :set_visibility diff --git a/app/services/activitypub/process_collection_service.rb b/app/services/activitypub/process_collection_service.rb @@ -3,9 +3,10 @@ class ActivityPub::ProcessCollectionService < BaseService include JsonLdHelper - def call(body, account) + def call(body, account, options = {}) @account = account @json = Oj.load(body, mode: :strict) + @options = options return unless supported_context? return if different_actor? && verify_account!.nil? @@ -38,7 +39,7 @@ class ActivityPub::ProcessCollectionService < BaseService end def process_item(item) - activity = ActivityPub::Activity.factory(item, @account) + activity = ActivityPub::Activity.factory(item, @account, @options) activity&.perform end diff --git a/app/services/process_feed_service.rb b/app/services/process_feed_service.rb @@ -1,7 +1,9 @@ # frozen_string_literal: true class ProcessFeedService < BaseService - def call(body, account) + def call(body, account, options = {}) + @options = options + xml = Nokogiri::XML(body) xml.encoding = 'utf-8' @@ -20,7 +22,7 @@ class ProcessFeedService < BaseService end def process_entry(xml, account) - activity = OStatus::Activity::General.new(xml, account) + activity = OStatus::Activity::General.new(xml, account, @options) activity.specialize&.perform if activity.status? rescue ActiveRecord::RecordInvalid => e Rails.logger.debug "Nothing was saved for #{activity.id} because: #{e}" diff --git a/app/workers/activitypub/processing_worker.rb b/app/workers/activitypub/processing_worker.rb @@ -6,6 +6,6 @@ class ActivityPub::ProcessingWorker sidekiq_options backtrace: true def perform(account_id, body) - ActivityPub::ProcessCollectionService.new.call(body, Account.find(account_id)) + ActivityPub::ProcessCollectionService.new.call(body, Account.find(account_id), override_timestamps: true) end end diff --git a/app/workers/processing_worker.rb b/app/workers/processing_worker.rb @@ -6,6 +6,6 @@ class ProcessingWorker sidekiq_options backtrace: true def perform(account_id, body) - ProcessFeedService.new.call(body, Account.find(account_id)) + ProcessFeedService.new.call(body, Account.find(account_id), override_timestamps: true) end end diff --git a/config/application.rb b/config/application.rb @@ -9,6 +9,7 @@ Bundler.require(*Rails.groups) require_relative '../app/lib/exceptions' require_relative '../lib/paperclip/gif_transcoder' require_relative '../lib/paperclip/video_transcoder' +require_relative '../lib/mastodon/snowflake' require_relative '../lib/mastodon/version' Dotenv::Railtie.load diff --git a/config/brakeman.ignore b/config/brakeman.ignore @@ -58,26 +58,6 @@ "note": "" }, { - "warning_type": "SQL Injection", - "warning_code": 0, - "fingerprint": "34efc76883080f8b1110a30c34ec4f903946ee56651aae46c62477f45d4fc412", - "check_name": "SQL", - "message": "Possible SQL injection", - "file": "lib/mastodon/timestamp_ids.rb", - "line": 63, - "link": "http://brakemanscanner.org/docs/warning_types/sql_injection/", - "code": "connection.execute(\" CREATE OR REPLACE FUNCTION timestamp_id(table_name text)\\n RETURNS bigint AS\\n $$\\n DECLARE\\n time_part bigint;\\n sequence_base bigint;\\n tail bigint;\\n BEGIN\\n time_part := (\\n -- Get the time in milliseconds\\n ((date_part('epoch', now()) * 1000))::bigint\\n -- And shift it over two bytes\\n << 16);\\n\\n sequence_base := (\\n 'x' ||\\n -- Take the first two bytes (four hex characters)\\n substr(\\n -- Of the MD5 hash of the data we documented\\n md5(table_name ||\\n '#{SecureRandom.hex(16)}' ||\\n time_part::text\\n ),\\n 1, 4\\n )\\n -- And turn it into a bigint\\n )::bit(16)::bigint;\\n\\n -- Finally, add our sequence number to our base, and chop\\n -- it to the last two bytes\\n tail := (\\n (sequence_base + nextval(table_name || '_id_seq'))\\n & 65535);\\n\\n -- Return the time part and the sequence part. OR appears\\n -- faster here than addition, but they're equivalent:\\n -- time_part has no trailing two bytes, and tail is only\\n -- the last two bytes.\\n RETURN time_part | tail;\\n END\\n $$ LANGUAGE plpgsql VOLATILE;\\n\")", - "render_path": null, - "location": { - "type": "method", - "class": "Mastodon::TimestampIds", - "method": "define_timestamp_id" - }, - "user_input": "SecureRandom.hex(16)", - "confidence": "Medium", - "note": "" - }, - { "warning_type": "Dynamic Render Path", "warning_code": 15, "fingerprint": "3b0a20b08aef13cf8cf865384fae0cfd3324d8200a83262bf4abbc8091b5fec5", @@ -106,7 +86,7 @@ "line": 3, "link": "http://brakemanscanner.org/docs/warning_types/dynamic_render_path/", "code": "render(action => \"stream_entries/#{Account.find_local!(params[:account_username]).statuses.find(params[:id]).stream_entry.activity_type.downcase}\", { Account.find_local!(params[:account_username]).statuses.find(params[:id]).stream_entry.activity_type.downcase.to_sym => Account.find_local!(params[:account_username]).statuses.find(params[:id]).stream_entry.activity, :centered => true })", - "render_path": [{"type":"controller","class":"StatusesController","method":"embed","line":35,"file":"app/controllers/statuses_controller.rb"}], + "render_path": [{"type":"controller","class":"StatusesController","method":"embed","line":41,"file":"app/controllers/statuses_controller.rb"}], "location": { "type": "template", "template": "stream_entries/embed" @@ -154,6 +134,26 @@ "note": "" }, { + "warning_type": "SQL Injection", + "warning_code": 0, + "fingerprint": "9ccb9ba6a6947400e187d515e0bf719d22993d37cfc123c824d7fafa6caa9ac3", + "check_name": "SQL", + "message": "Possible SQL injection", + "file": "lib/mastodon/snowflake.rb", + "line": 86, + "link": "http://brakemanscanner.org/docs/warning_types/sql_injection/", + "code": "connection.execute(\" CREATE OR REPLACE FUNCTION timestamp_id(table_name text)\\n RETURNS bigint AS\\n $$\\n DECLARE\\n time_part bigint;\\n sequence_base bigint;\\n tail bigint;\\n BEGIN\\n time_part := (\\n -- Get the time in milliseconds\\n ((date_part('epoch', now()) * 1000))::bigint\\n -- And shift it over two bytes\\n << 16);\\n\\n sequence_base := (\\n 'x' ||\\n -- Take the first two bytes (four hex characters)\\n substr(\\n -- Of the MD5 hash of the data we documented\\n md5(table_name ||\\n '#{SecureRandom.hex(16)}' ||\\n time_part::text\\n ),\\n 1, 4\\n )\\n -- And turn it into a bigint\\n )::bit(16)::bigint;\\n\\n -- Finally, add our sequence number to our base, and chop\\n -- it to the last two bytes\\n tail := (\\n (sequence_base + nextval(table_name || '_id_seq'))\\n & 65535);\\n\\n -- Return the time part and the sequence part. OR appears\\n -- faster here than addition, but they're equivalent:\\n -- time_part has no trailing two bytes, and tail is only\\n -- the last two bytes.\\n RETURN time_part | tail;\\n END\\n $$ LANGUAGE plpgsql VOLATILE;\\n\")", + "render_path": null, + "location": { + "type": "method", + "class": "Mastodon::Snowflake", + "method": "define_timestamp_id" + }, + "user_input": "SecureRandom.hex(16)", + "confidence": "Medium", + "note": "" + }, + { "warning_type": "Dynamic Render Path", "warning_code": 15, "fingerprint": "9f31d941f3910dba2e9bfcd81aef4513249bd24c02d0f98e13ad44fdeeccd0e8", @@ -269,6 +269,6 @@ "note": "" } ], - "updated": "2017-10-06 03:27:46 +0200", + "updated": "2017-10-07 19:24:02 +0200", "brakeman_version": "4.0.1" } diff --git a/lib/mastodon/snowflake.rb b/lib/mastodon/snowflake.rb @@ -0,0 +1,162 @@ +# frozen_string_literal: true + +module Mastodon::Snowflake + DEFAULT_REGEX = /timestamp_id\('(?<seq_prefix>\w+)'/ + + class Callbacks + def self.around_create(record) + now = Time.now.utc + + if record.created_at.nil? || record.created_at >= now || record.created_at == record.updated_at + yield + else + record.id = Mastodon::Snowflake.id_at(record.created_at) + tries = 0 + + begin + yield + rescue ActiveRecord::RecordNotUnique + raise if tries > 100 + + tries += 1 + record.id += rand(100) + + retry + end + end + end + end + + class << self + # Our ID will be composed of the following: + # 6 bytes (48 bits) of millisecond-level timestamp + # 2 bytes (16 bits) of sequence data + # + # The 'sequence data' is intended to be unique within a + # given millisecond, yet obscure the 'serial number' of + # this row. + # + # To do this, we hash the following data: + # * Table name (if provided, skipped if not) + # * Secret salt (should not be guessable) + # * Timestamp (again, millisecond-level granularity) + # + # We then take the first two bytes of that value, and add + # the lowest two bytes of the table ID sequence number + # (`table_name`_id_seq). This means that even if we insert + # two rows at the same millisecond, they will have + # distinct 'sequence data' portions. + # + # If this happens, and an attacker can see both such IDs, + # they can determine which of the two entries was inserted + # first, but not the total number of entries in the table + # (even mod 2**16). + # + # The table name is included in the hash to ensure that + # different tables derive separate sequence bases so rows + # inserted in the same millisecond in different tables do + # not reveal the table ID sequence number for one another. + # + # The secret salt is included in the hash to ensure that + # external users cannot derive the sequence base given the + # timestamp and table name, which would allow them to + # compute the table ID sequence number. + def define_timestamp_id + return if already_defined? + + connection.execute(<<~SQL) + CREATE OR REPLACE FUNCTION timestamp_id(table_name text) + RETURNS bigint AS + $$ + DECLARE + time_part bigint; + sequence_base bigint; + tail bigint; + BEGIN + time_part := ( + -- Get the time in milliseconds + ((date_part('epoch', now()) * 1000))::bigint + -- And shift it over two bytes + << 16); + + sequence_base := ( + 'x' || + -- Take the first two bytes (four hex characters) + substr( + -- Of the MD5 hash of the data we documented + md5(table_name || + '#{SecureRandom.hex(16)}' || + time_part::text + ), + 1, 4 + ) + -- And turn it into a bigint + )::bit(16)::bigint; + + -- Finally, add our sequence number to our base, and chop + -- it to the last two bytes + tail := ( + (sequence_base + nextval(table_name || '_id_seq')) + & 65535); + + -- Return the time part and the sequence part. OR appears + -- faster here than addition, but they're equivalent: + -- time_part has no trailing two bytes, and tail is only + -- the last two bytes. + RETURN time_part | tail; + END + $$ LANGUAGE plpgsql VOLATILE; + SQL + end + + def ensure_id_sequences_exist + # Find tables using timestamp IDs. + connection.tables.each do |table| + # We're only concerned with "id" columns. + next unless (id_col = connection.columns(table).find { |col| col.name == 'id' }) + + # And only those that are using timestamp_id. + next unless (data = DEFAULT_REGEX.match(id_col.default_function)) + + seq_name = data[:seq_prefix] + '_id_seq' + + # If we were on Postgres 9.5+, we could do CREATE SEQUENCE IF + # NOT EXISTS, but we can't depend on that. Instead, catch the + # possible exception and ignore it. + # Note that seq_name isn't a column name, but it's a + # relation, like a column, and follows the same quoting rules + # in Postgres. + connection.execute(<<~SQL) + DO $$ + BEGIN + CREATE SEQUENCE #{connection.quote_column_name(seq_name)}; + EXCEPTION WHEN duplicate_table THEN + -- Do nothing, we have the sequence already. + END + $$ LANGUAGE plpgsql; + SQL + end + end + + def id_at(timestamp) + id = timestamp.to_i * 1000 + rand(1000) + id = id << 16 + id += rand(2**16) + id + end + + private + + def already_defined? + connection.execute(<<~SQL).values.first.first + SELECT EXISTS( + SELECT * FROM pg_proc WHERE proname = 'timestamp_id' + ); + SQL + end + + def connection + ActiveRecord::Base.connection + end + end +end diff --git a/lib/mastodon/timestamp_ids.rb b/lib/mastodon/timestamp_ids.rb @@ -1,131 +0,0 @@ -# frozen_string_literal: true - -module Mastodon::TimestampIds - DEFAULT_REGEX = /timestamp_id\('(?<seq_prefix>\w+)'/ - - class << self - # Our ID will be composed of the following: - # 6 bytes (48 bits) of millisecond-level timestamp - # 2 bytes (16 bits) of sequence data - # - # The 'sequence data' is intended to be unique within a - # given millisecond, yet obscure the 'serial number' of - # this row. - # - # To do this, we hash the following data: - # * Table name (if provided, skipped if not) - # * Secret salt (should not be guessable) - # * Timestamp (again, millisecond-level granularity) - # - # We then take the first two bytes of that value, and add - # the lowest two bytes of the table ID sequence number - # (`table_name`_id_seq). This means that even if we insert - # two rows at the same millisecond, they will have - # distinct 'sequence data' portions. - # - # If this happens, and an attacker can see both such IDs, - # they can determine which of the two entries was inserted - # first, but not the total number of entries in the table - # (even mod 2**16). - # - # The table name is included in the hash to ensure that - # different tables derive separate sequence bases so rows - # inserted in the same millisecond in different tables do - # not reveal the table ID sequence number for one another. - # - # The secret salt is included in the hash to ensure that - # external users cannot derive the sequence base given the - # timestamp and table name, which would allow them to - # compute the table ID sequence number. - def define_timestamp_id - return if already_defined? - - connection.execute(<<~SQL) - CREATE OR REPLACE FUNCTION timestamp_id(table_name text) - RETURNS bigint AS - $$ - DECLARE - time_part bigint; - sequence_base bigint; - tail bigint; - BEGIN - time_part := ( - -- Get the time in milliseconds - ((date_part('epoch', now()) * 1000))::bigint - -- And shift it over two bytes - << 16); - - sequence_base := ( - 'x' || - -- Take the first two bytes (four hex characters) - substr( - -- Of the MD5 hash of the data we documented - md5(table_name || - '#{SecureRandom.hex(16)}' || - time_part::text - ), - 1, 4 - ) - -- And turn it into a bigint - )::bit(16)::bigint; - - -- Finally, add our sequence number to our base, and chop - -- it to the last two bytes - tail := ( - (sequence_base + nextval(table_name || '_id_seq')) - & 65535); - - -- Return the time part and the sequence part. OR appears - -- faster here than addition, but they're equivalent: - -- time_part has no trailing two bytes, and tail is only - -- the last two bytes. - RETURN time_part | tail; - END - $$ LANGUAGE plpgsql VOLATILE; - SQL - end - - def ensure_id_sequences_exist - # Find tables using timestamp IDs. - connection.tables.each do |table| - # We're only concerned with "id" columns. - next unless (id_col = connection.columns(table).find { |col| col.name == 'id' }) - - # And only those that are using timestamp_id. - next unless (data = DEFAULT_REGEX.match(id_col.default_function)) - - seq_name = data[:seq_prefix] + '_id_seq' - - # If we were on Postgres 9.5+, we could do CREATE SEQUENCE IF - # NOT EXISTS, but we can't depend on that. Instead, catch the - # possible exception and ignore it. - # Note that seq_name isn't a column name, but it's a - # relation, like a column, and follows the same quoting rules - # in Postgres. - connection.execute(<<~SQL) - DO $$ - BEGIN - CREATE SEQUENCE #{connection.quote_column_name(seq_name)}; - EXCEPTION WHEN duplicate_table THEN - -- Do nothing, we have the sequence already. - END - $$ LANGUAGE plpgsql; - SQL - end - end - - private - - def already_defined? - connection.execute(<<~SQL).values.first.first - SELECT EXISTS( - SELECT * FROM pg_proc WHERE proname = 'timestamp_id' - ); - SQL - end - - def connection - ActiveRecord::Base.connection - end - end -end diff --git a/lib/tasks/db.rake b/lib/tasks/db.rake @@ -1,6 +1,6 @@ # frozen_string_literal: true -require Rails.root.join('lib', 'mastodon', 'timestamp_ids') +require_relative '../mastodon/snowflake' def each_schema_load_environment # If we're in development, also run this for the test environment. @@ -63,13 +63,13 @@ namespace :db do task :define_timestamp_id do each_schema_load_environment do - Mastodon::TimestampIds.define_timestamp_id + Mastodon::Snowflake.define_timestamp_id end end task :ensure_id_sequences_exist do each_schema_load_environment do - Mastodon::TimestampIds.ensure_id_sequences_exist + Mastodon::Snowflake.ensure_id_sequences_exist end end end diff --git a/spec/services/activitypub/process_collection_service_spec.rb b/spec/services/activitypub/process_collection_service_spec.rb @@ -28,7 +28,7 @@ RSpec.describe ActivityPub::ProcessCollectionService do it 'processes payload with sender if no signature exists' do expect_any_instance_of(ActivityPub::LinkedDataSignature).not_to receive(:verify_account!) - expect(ActivityPub::Activity).to receive(:factory).with(instance_of(Hash), forwarder) + expect(ActivityPub::Activity).to receive(:factory).with(instance_of(Hash), forwarder, instance_of(Hash)) subject.call(json, forwarder) end @@ -37,7 +37,7 @@ RSpec.describe ActivityPub::ProcessCollectionService do payload['signature'] = {'type' => 'RsaSignature2017'} expect_any_instance_of(ActivityPub::LinkedDataSignature).to receive(:verify_account!).and_return(actor) - expect(ActivityPub::Activity).to receive(:factory).with(instance_of(Hash), actor) + expect(ActivityPub::Activity).to receive(:factory).with(instance_of(Hash), actor, instance_of(Hash)) subject.call(json, forwarder) end