commit: 0717d9b3e6904a4dcd5d2dc9e680cc5b21c50e51
parent: 6e4046fc3f3973ba0b6994930a8b58726e507003
Author: Eugen Rochko <eugen@zeonfederated.com>
Date: Sun, 8 Oct 2017 17:34:34 +0200
Set snowflake IDs for backdated statuses (#5260)
- Rename Mastodon::TimestampIds into Mastodon::Snowflake for clarity
- Skip for statuses coming from inbox, aka delivered in real-time
- Skip for statuses that claim to be from the future
Diffstat:
17 files changed, 213 insertions(+), 173 deletions(-)
diff --git a/app/lib/activitypub/activity.rb b/app/lib/activitypub/activity.rb
@@ -3,10 +3,11 @@
class ActivityPub::Activity
include JsonLdHelper
- def initialize(json, account)
+ def initialize(json, account, options = {})
@json = json
@account = account
@object = @json['object']
+ @options = options
end
def perform
@@ -14,9 +15,9 @@ class ActivityPub::Activity
end
class << self
- def factory(json, account)
+ def factory(json, account, options = {})
@json = json
- klass&.new(json, account)
+ klass&.new(json, account, options)
end
private
diff --git a/app/lib/activitypub/activity/announce.rb b/app/lib/activitypub/activity/announce.rb
@@ -15,8 +15,9 @@ class ActivityPub::Activity::Announce < ActivityPub::Activity
account: @account,
reblog: original_status,
uri: @json['id'],
- created_at: @json['published'] || Time.now.utc
+ created_at: @options[:override_timestamps] ? nil : @json['published']
)
+
distribute(status)
status
end
diff --git a/app/lib/activitypub/activity/create.rb b/app/lib/activitypub/activity/create.rb
@@ -43,7 +43,7 @@ class ActivityPub::Activity::Create < ActivityPub::Activity
text: text_from_content || '',
language: language_from_content,
spoiler_text: @object['summary'] || '',
- created_at: @object['published'] || Time.now.utc,
+ created_at: @options[:override_timestamps] ? nil : @object['published'],
reply: @object['inReplyTo'].present?,
sensitive: @object['sensitive'] || false,
visibility: visibility_from_audience,
diff --git a/app/lib/ostatus/activity/base.rb b/app/lib/ostatus/activity/base.rb
@@ -1,9 +1,10 @@
# frozen_string_literal: true
class OStatus::Activity::Base
- def initialize(xml, account = nil)
- @xml = xml
+ def initialize(xml, account = nil, options = {})
+ @xml = xml
@account = account
+ @options = options
end
def status?
diff --git a/app/lib/ostatus/activity/creation.rb b/app/lib/ostatus/activity/creation.rb
@@ -34,7 +34,7 @@ class OStatus::Activity::Creation < OStatus::Activity::Base
reblog: cached_reblog,
text: content,
spoiler_text: content_warning,
- created_at: published,
+ created_at: @options[:override_timestamps] ? nil : published,
reply: thread?,
language: content_language,
visibility: visibility_scope,
diff --git a/app/lib/ostatus/activity/general.rb b/app/lib/ostatus/activity/general.rb
@@ -2,7 +2,7 @@
class OStatus::Activity::General < OStatus::Activity::Base
def specialize
- special_class&.new(@xml, @account)
+ special_class&.new(@xml, @account, @options)
end
private
diff --git a/app/models/status.rb b/app/models/status.rb
@@ -136,6 +136,8 @@ class Status < ApplicationRecord
after_create :store_uri, if: :local?
+ around_create Mastodon::Snowflake::Callbacks
+
before_validation :prepare_contents, if: :local?
before_validation :set_reblog
before_validation :set_visibility
diff --git a/app/services/activitypub/process_collection_service.rb b/app/services/activitypub/process_collection_service.rb
@@ -3,9 +3,10 @@
class ActivityPub::ProcessCollectionService < BaseService
include JsonLdHelper
- def call(body, account)
+ def call(body, account, options = {})
@account = account
@json = Oj.load(body, mode: :strict)
+ @options = options
return unless supported_context?
return if different_actor? && verify_account!.nil?
@@ -38,7 +39,7 @@ class ActivityPub::ProcessCollectionService < BaseService
end
def process_item(item)
- activity = ActivityPub::Activity.factory(item, @account)
+ activity = ActivityPub::Activity.factory(item, @account, @options)
activity&.perform
end
diff --git a/app/services/process_feed_service.rb b/app/services/process_feed_service.rb
@@ -1,7 +1,9 @@
# frozen_string_literal: true
class ProcessFeedService < BaseService
- def call(body, account)
+ def call(body, account, options = {})
+ @options = options
+
xml = Nokogiri::XML(body)
xml.encoding = 'utf-8'
@@ -20,7 +22,7 @@ class ProcessFeedService < BaseService
end
def process_entry(xml, account)
- activity = OStatus::Activity::General.new(xml, account)
+ activity = OStatus::Activity::General.new(xml, account, @options)
activity.specialize&.perform if activity.status?
rescue ActiveRecord::RecordInvalid => e
Rails.logger.debug "Nothing was saved for #{activity.id} because: #{e}"
diff --git a/app/workers/activitypub/processing_worker.rb b/app/workers/activitypub/processing_worker.rb
@@ -6,6 +6,6 @@ class ActivityPub::ProcessingWorker
sidekiq_options backtrace: true
def perform(account_id, body)
- ActivityPub::ProcessCollectionService.new.call(body, Account.find(account_id))
+ ActivityPub::ProcessCollectionService.new.call(body, Account.find(account_id), override_timestamps: true)
end
end
diff --git a/app/workers/processing_worker.rb b/app/workers/processing_worker.rb
@@ -6,6 +6,6 @@ class ProcessingWorker
sidekiq_options backtrace: true
def perform(account_id, body)
- ProcessFeedService.new.call(body, Account.find(account_id))
+ ProcessFeedService.new.call(body, Account.find(account_id), override_timestamps: true)
end
end
diff --git a/config/application.rb b/config/application.rb
@@ -9,6 +9,7 @@ Bundler.require(*Rails.groups)
require_relative '../app/lib/exceptions'
require_relative '../lib/paperclip/gif_transcoder'
require_relative '../lib/paperclip/video_transcoder'
+require_relative '../lib/mastodon/snowflake'
require_relative '../lib/mastodon/version'
Dotenv::Railtie.load
diff --git a/config/brakeman.ignore b/config/brakeman.ignore
@@ -58,26 +58,6 @@
"note": ""
},
{
- "warning_type": "SQL Injection",
- "warning_code": 0,
- "fingerprint": "34efc76883080f8b1110a30c34ec4f903946ee56651aae46c62477f45d4fc412",
- "check_name": "SQL",
- "message": "Possible SQL injection",
- "file": "lib/mastodon/timestamp_ids.rb",
- "line": 63,
- "link": "http://brakemanscanner.org/docs/warning_types/sql_injection/",
- "code": "connection.execute(\" CREATE OR REPLACE FUNCTION timestamp_id(table_name text)\\n RETURNS bigint AS\\n $$\\n DECLARE\\n time_part bigint;\\n sequence_base bigint;\\n tail bigint;\\n BEGIN\\n time_part := (\\n -- Get the time in milliseconds\\n ((date_part('epoch', now()) * 1000))::bigint\\n -- And shift it over two bytes\\n << 16);\\n\\n sequence_base := (\\n 'x' ||\\n -- Take the first two bytes (four hex characters)\\n substr(\\n -- Of the MD5 hash of the data we documented\\n md5(table_name ||\\n '#{SecureRandom.hex(16)}' ||\\n time_part::text\\n ),\\n 1, 4\\n )\\n -- And turn it into a bigint\\n )::bit(16)::bigint;\\n\\n -- Finally, add our sequence number to our base, and chop\\n -- it to the last two bytes\\n tail := (\\n (sequence_base + nextval(table_name || '_id_seq'))\\n & 65535);\\n\\n -- Return the time part and the sequence part. OR appears\\n -- faster here than addition, but they're equivalent:\\n -- time_part has no trailing two bytes, and tail is only\\n -- the last two bytes.\\n RETURN time_part | tail;\\n END\\n $$ LANGUAGE plpgsql VOLATILE;\\n\")",
- "render_path": null,
- "location": {
- "type": "method",
- "class": "Mastodon::TimestampIds",
- "method": "define_timestamp_id"
- },
- "user_input": "SecureRandom.hex(16)",
- "confidence": "Medium",
- "note": ""
- },
- {
"warning_type": "Dynamic Render Path",
"warning_code": 15,
"fingerprint": "3b0a20b08aef13cf8cf865384fae0cfd3324d8200a83262bf4abbc8091b5fec5",
@@ -106,7 +86,7 @@
"line": 3,
"link": "http://brakemanscanner.org/docs/warning_types/dynamic_render_path/",
"code": "render(action => \"stream_entries/#{Account.find_local!(params[:account_username]).statuses.find(params[:id]).stream_entry.activity_type.downcase}\", { Account.find_local!(params[:account_username]).statuses.find(params[:id]).stream_entry.activity_type.downcase.to_sym => Account.find_local!(params[:account_username]).statuses.find(params[:id]).stream_entry.activity, :centered => true })",
- "render_path": [{"type":"controller","class":"StatusesController","method":"embed","line":35,"file":"app/controllers/statuses_controller.rb"}],
+ "render_path": [{"type":"controller","class":"StatusesController","method":"embed","line":41,"file":"app/controllers/statuses_controller.rb"}],
"location": {
"type": "template",
"template": "stream_entries/embed"
@@ -154,6 +134,26 @@
"note": ""
},
{
+ "warning_type": "SQL Injection",
+ "warning_code": 0,
+ "fingerprint": "9ccb9ba6a6947400e187d515e0bf719d22993d37cfc123c824d7fafa6caa9ac3",
+ "check_name": "SQL",
+ "message": "Possible SQL injection",
+ "file": "lib/mastodon/snowflake.rb",
+ "line": 86,
+ "link": "http://brakemanscanner.org/docs/warning_types/sql_injection/",
+ "code": "connection.execute(\" CREATE OR REPLACE FUNCTION timestamp_id(table_name text)\\n RETURNS bigint AS\\n $$\\n DECLARE\\n time_part bigint;\\n sequence_base bigint;\\n tail bigint;\\n BEGIN\\n time_part := (\\n -- Get the time in milliseconds\\n ((date_part('epoch', now()) * 1000))::bigint\\n -- And shift it over two bytes\\n << 16);\\n\\n sequence_base := (\\n 'x' ||\\n -- Take the first two bytes (four hex characters)\\n substr(\\n -- Of the MD5 hash of the data we documented\\n md5(table_name ||\\n '#{SecureRandom.hex(16)}' ||\\n time_part::text\\n ),\\n 1, 4\\n )\\n -- And turn it into a bigint\\n )::bit(16)::bigint;\\n\\n -- Finally, add our sequence number to our base, and chop\\n -- it to the last two bytes\\n tail := (\\n (sequence_base + nextval(table_name || '_id_seq'))\\n & 65535);\\n\\n -- Return the time part and the sequence part. OR appears\\n -- faster here than addition, but they're equivalent:\\n -- time_part has no trailing two bytes, and tail is only\\n -- the last two bytes.\\n RETURN time_part | tail;\\n END\\n $$ LANGUAGE plpgsql VOLATILE;\\n\")",
+ "render_path": null,
+ "location": {
+ "type": "method",
+ "class": "Mastodon::Snowflake",
+ "method": "define_timestamp_id"
+ },
+ "user_input": "SecureRandom.hex(16)",
+ "confidence": "Medium",
+ "note": ""
+ },
+ {
"warning_type": "Dynamic Render Path",
"warning_code": 15,
"fingerprint": "9f31d941f3910dba2e9bfcd81aef4513249bd24c02d0f98e13ad44fdeeccd0e8",
@@ -269,6 +269,6 @@
"note": ""
}
],
- "updated": "2017-10-06 03:27:46 +0200",
+ "updated": "2017-10-07 19:24:02 +0200",
"brakeman_version": "4.0.1"
}
diff --git a/lib/mastodon/snowflake.rb b/lib/mastodon/snowflake.rb
@@ -0,0 +1,162 @@
+# frozen_string_literal: true
+
+module Mastodon::Snowflake
+ DEFAULT_REGEX = /timestamp_id\('(?<seq_prefix>\w+)'/
+
+ class Callbacks
+ def self.around_create(record)
+ now = Time.now.utc
+
+ if record.created_at.nil? || record.created_at >= now || record.created_at == record.updated_at
+ yield
+ else
+ record.id = Mastodon::Snowflake.id_at(record.created_at)
+ tries = 0
+
+ begin
+ yield
+ rescue ActiveRecord::RecordNotUnique
+ raise if tries > 100
+
+ tries += 1
+ record.id += rand(100)
+
+ retry
+ end
+ end
+ end
+ end
+
+ class << self
+ # Our ID will be composed of the following:
+ # 6 bytes (48 bits) of millisecond-level timestamp
+ # 2 bytes (16 bits) of sequence data
+ #
+ # The 'sequence data' is intended to be unique within a
+ # given millisecond, yet obscure the 'serial number' of
+ # this row.
+ #
+ # To do this, we hash the following data:
+ # * Table name (if provided, skipped if not)
+ # * Secret salt (should not be guessable)
+ # * Timestamp (again, millisecond-level granularity)
+ #
+ # We then take the first two bytes of that value, and add
+ # the lowest two bytes of the table ID sequence number
+ # (`table_name`_id_seq). This means that even if we insert
+ # two rows at the same millisecond, they will have
+ # distinct 'sequence data' portions.
+ #
+ # If this happens, and an attacker can see both such IDs,
+ # they can determine which of the two entries was inserted
+ # first, but not the total number of entries in the table
+ # (even mod 2**16).
+ #
+ # The table name is included in the hash to ensure that
+ # different tables derive separate sequence bases so rows
+ # inserted in the same millisecond in different tables do
+ # not reveal the table ID sequence number for one another.
+ #
+ # The secret salt is included in the hash to ensure that
+ # external users cannot derive the sequence base given the
+ # timestamp and table name, which would allow them to
+ # compute the table ID sequence number.
+ def define_timestamp_id
+ return if already_defined?
+
+ connection.execute(<<~SQL)
+ CREATE OR REPLACE FUNCTION timestamp_id(table_name text)
+ RETURNS bigint AS
+ $$
+ DECLARE
+ time_part bigint;
+ sequence_base bigint;
+ tail bigint;
+ BEGIN
+ time_part := (
+ -- Get the time in milliseconds
+ ((date_part('epoch', now()) * 1000))::bigint
+ -- And shift it over two bytes
+ << 16);
+
+ sequence_base := (
+ 'x' ||
+ -- Take the first two bytes (four hex characters)
+ substr(
+ -- Of the MD5 hash of the data we documented
+ md5(table_name ||
+ '#{SecureRandom.hex(16)}' ||
+ time_part::text
+ ),
+ 1, 4
+ )
+ -- And turn it into a bigint
+ )::bit(16)::bigint;
+
+ -- Finally, add our sequence number to our base, and chop
+ -- it to the last two bytes
+ tail := (
+ (sequence_base + nextval(table_name || '_id_seq'))
+ & 65535);
+
+ -- Return the time part and the sequence part. OR appears
+ -- faster here than addition, but they're equivalent:
+ -- time_part has no trailing two bytes, and tail is only
+ -- the last two bytes.
+ RETURN time_part | tail;
+ END
+ $$ LANGUAGE plpgsql VOLATILE;
+ SQL
+ end
+
+ def ensure_id_sequences_exist
+ # Find tables using timestamp IDs.
+ connection.tables.each do |table|
+ # We're only concerned with "id" columns.
+ next unless (id_col = connection.columns(table).find { |col| col.name == 'id' })
+
+ # And only those that are using timestamp_id.
+ next unless (data = DEFAULT_REGEX.match(id_col.default_function))
+
+ seq_name = data[:seq_prefix] + '_id_seq'
+
+ # If we were on Postgres 9.5+, we could do CREATE SEQUENCE IF
+ # NOT EXISTS, but we can't depend on that. Instead, catch the
+ # possible exception and ignore it.
+ # Note that seq_name isn't a column name, but it's a
+ # relation, like a column, and follows the same quoting rules
+ # in Postgres.
+ connection.execute(<<~SQL)
+ DO $$
+ BEGIN
+ CREATE SEQUENCE #{connection.quote_column_name(seq_name)};
+ EXCEPTION WHEN duplicate_table THEN
+ -- Do nothing, we have the sequence already.
+ END
+ $$ LANGUAGE plpgsql;
+ SQL
+ end
+ end
+
+ def id_at(timestamp)
+ id = timestamp.to_i * 1000 + rand(1000)
+ id = id << 16
+ id += rand(2**16)
+ id
+ end
+
+ private
+
+ def already_defined?
+ connection.execute(<<~SQL).values.first.first
+ SELECT EXISTS(
+ SELECT * FROM pg_proc WHERE proname = 'timestamp_id'
+ );
+ SQL
+ end
+
+ def connection
+ ActiveRecord::Base.connection
+ end
+ end
+end
diff --git a/lib/mastodon/timestamp_ids.rb b/lib/mastodon/timestamp_ids.rb
@@ -1,131 +0,0 @@
-# frozen_string_literal: true
-
-module Mastodon::TimestampIds
- DEFAULT_REGEX = /timestamp_id\('(?<seq_prefix>\w+)'/
-
- class << self
- # Our ID will be composed of the following:
- # 6 bytes (48 bits) of millisecond-level timestamp
- # 2 bytes (16 bits) of sequence data
- #
- # The 'sequence data' is intended to be unique within a
- # given millisecond, yet obscure the 'serial number' of
- # this row.
- #
- # To do this, we hash the following data:
- # * Table name (if provided, skipped if not)
- # * Secret salt (should not be guessable)
- # * Timestamp (again, millisecond-level granularity)
- #
- # We then take the first two bytes of that value, and add
- # the lowest two bytes of the table ID sequence number
- # (`table_name`_id_seq). This means that even if we insert
- # two rows at the same millisecond, they will have
- # distinct 'sequence data' portions.
- #
- # If this happens, and an attacker can see both such IDs,
- # they can determine which of the two entries was inserted
- # first, but not the total number of entries in the table
- # (even mod 2**16).
- #
- # The table name is included in the hash to ensure that
- # different tables derive separate sequence bases so rows
- # inserted in the same millisecond in different tables do
- # not reveal the table ID sequence number for one another.
- #
- # The secret salt is included in the hash to ensure that
- # external users cannot derive the sequence base given the
- # timestamp and table name, which would allow them to
- # compute the table ID sequence number.
- def define_timestamp_id
- return if already_defined?
-
- connection.execute(<<~SQL)
- CREATE OR REPLACE FUNCTION timestamp_id(table_name text)
- RETURNS bigint AS
- $$
- DECLARE
- time_part bigint;
- sequence_base bigint;
- tail bigint;
- BEGIN
- time_part := (
- -- Get the time in milliseconds
- ((date_part('epoch', now()) * 1000))::bigint
- -- And shift it over two bytes
- << 16);
-
- sequence_base := (
- 'x' ||
- -- Take the first two bytes (four hex characters)
- substr(
- -- Of the MD5 hash of the data we documented
- md5(table_name ||
- '#{SecureRandom.hex(16)}' ||
- time_part::text
- ),
- 1, 4
- )
- -- And turn it into a bigint
- )::bit(16)::bigint;
-
- -- Finally, add our sequence number to our base, and chop
- -- it to the last two bytes
- tail := (
- (sequence_base + nextval(table_name || '_id_seq'))
- & 65535);
-
- -- Return the time part and the sequence part. OR appears
- -- faster here than addition, but they're equivalent:
- -- time_part has no trailing two bytes, and tail is only
- -- the last two bytes.
- RETURN time_part | tail;
- END
- $$ LANGUAGE plpgsql VOLATILE;
- SQL
- end
-
- def ensure_id_sequences_exist
- # Find tables using timestamp IDs.
- connection.tables.each do |table|
- # We're only concerned with "id" columns.
- next unless (id_col = connection.columns(table).find { |col| col.name == 'id' })
-
- # And only those that are using timestamp_id.
- next unless (data = DEFAULT_REGEX.match(id_col.default_function))
-
- seq_name = data[:seq_prefix] + '_id_seq'
-
- # If we were on Postgres 9.5+, we could do CREATE SEQUENCE IF
- # NOT EXISTS, but we can't depend on that. Instead, catch the
- # possible exception and ignore it.
- # Note that seq_name isn't a column name, but it's a
- # relation, like a column, and follows the same quoting rules
- # in Postgres.
- connection.execute(<<~SQL)
- DO $$
- BEGIN
- CREATE SEQUENCE #{connection.quote_column_name(seq_name)};
- EXCEPTION WHEN duplicate_table THEN
- -- Do nothing, we have the sequence already.
- END
- $$ LANGUAGE plpgsql;
- SQL
- end
- end
-
- private
-
- def already_defined?
- connection.execute(<<~SQL).values.first.first
- SELECT EXISTS(
- SELECT * FROM pg_proc WHERE proname = 'timestamp_id'
- );
- SQL
- end
-
- def connection
- ActiveRecord::Base.connection
- end
- end
-end
diff --git a/lib/tasks/db.rake b/lib/tasks/db.rake
@@ -1,6 +1,6 @@
# frozen_string_literal: true
-require Rails.root.join('lib', 'mastodon', 'timestamp_ids')
+require_relative '../mastodon/snowflake'
def each_schema_load_environment
# If we're in development, also run this for the test environment.
@@ -63,13 +63,13 @@ namespace :db do
task :define_timestamp_id do
each_schema_load_environment do
- Mastodon::TimestampIds.define_timestamp_id
+ Mastodon::Snowflake.define_timestamp_id
end
end
task :ensure_id_sequences_exist do
each_schema_load_environment do
- Mastodon::TimestampIds.ensure_id_sequences_exist
+ Mastodon::Snowflake.ensure_id_sequences_exist
end
end
end
diff --git a/spec/services/activitypub/process_collection_service_spec.rb b/spec/services/activitypub/process_collection_service_spec.rb
@@ -28,7 +28,7 @@ RSpec.describe ActivityPub::ProcessCollectionService do
it 'processes payload with sender if no signature exists' do
expect_any_instance_of(ActivityPub::LinkedDataSignature).not_to receive(:verify_account!)
- expect(ActivityPub::Activity).to receive(:factory).with(instance_of(Hash), forwarder)
+ expect(ActivityPub::Activity).to receive(:factory).with(instance_of(Hash), forwarder, instance_of(Hash))
subject.call(json, forwarder)
end
@@ -37,7 +37,7 @@ RSpec.describe ActivityPub::ProcessCollectionService do
payload['signature'] = {'type' => 'RsaSignature2017'}
expect_any_instance_of(ActivityPub::LinkedDataSignature).to receive(:verify_account!).and_return(actor)
- expect(ActivityPub::Activity).to receive(:factory).with(instance_of(Hash), actor)
+ expect(ActivityPub::Activity).to receive(:factory).with(instance_of(Hash), actor, instance_of(Hash))
subject.call(json, forwarder)
end