diff --git a/.rubocop.yml b/.rubocop.yml new file mode 100644 index 0000000..5874f4e --- /dev/null +++ b/.rubocop.yml @@ -0,0 +1,9 @@ +inherit_gem: + rubocop-config-umbrellio: lib/rubocop.yml + +AllCops: + DisplayCopNames: true + TargetRubyVersion: 2.3 + +Naming/UncommunicativeMethodParamName: + Enabled: false diff --git a/Gemfile b/Gemfile index 71c4607..fd36201 100644 --- a/Gemfile +++ b/Gemfile @@ -1,3 +1,5 @@ +# frozen_string_literal: true + source "https://rubygems.org" # Specify your gem's dependencies in sequel-batches.gemspec diff --git a/README.md b/README.md index 692e857..246cfa7 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -# Sequel::Batches [![Build Status](https://travis-ci.org/umbrellio/sequel-batches.svg?branch=master)](https://travis-ci.org/umbrellio/sequel-batches) [![Coverage Status](https://coveralls.io/repos/github/umbrellio/sequel-batches/badge.svg?branch=master)](https://coveralls.io/github/umbrellio/sequel-batches?branch=master) +# Sequel::Batches    [![Gem Version](https://badge.fury.io/rb/sequel-batches.svg)](https://badge.fury.io/rb/sequel-batches) [![Build Status](https://travis-ci.org/umbrellio/sequel-batches.svg?branch=master)](https://travis-ci.org/umbrellio/sequel-batches) [![Coverage Status](https://coveralls.io/repos/github/umbrellio/sequel-batches/badge.svg?branch=master)](https://coveralls.io/github/umbrellio/sequel-batches?branch=master) -This dataset extension provides the method #in_batches. The method splits dataset in parts and yields it. +This dataset extension provides the `#in_batches` method. The method splits dataset in parts and yields it. Note: currently only PostgreSQL database is supported. @@ -38,13 +38,13 @@ Or install it yourself as: ## Usage -In order to use the feature you should enable the extension +In order to use the feature you should enable the extension: ```ruby Sequel::DATABASES.first.extension :batches ``` -And then the method becomes available on dataset +After that the `#in_batches` method becomes available on dataset: ```ruby User.where(role: "admin").in_batches(of: 4) do |ds| @@ -52,28 +52,29 @@ User.where(role: "admin").in_batches(of: 4) do |ds| end ``` -Finally, here's an example including all the available options +Finally, here's an example including all the available options: ```ruby -Event.where(type: "login").in_batches(of: 4, pk: [:project_id, :external_user_id], start: { project_id: 2, external_user_id: 3 }, finish: { project_id: 5, external_user_id: 70 }) do |ds| +options = { + of: 4, + pk: [:project_id, :external_user_id], + start: { project_id: 2, external_user_id: 3 }, + finish: { project_id: 5, external_user_id: 70 }, +} + +Event.where(type: "login").in_batches(options) do |ds| ds.delete end ``` -## Development - -After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment. - -To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org). - ## Contributing -Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/sequel-batches. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct. +Bug reports and pull requests are welcome on GitHub at https://github.com/umbrellio/sequel-batches. ## License The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT). -## Code of Conduct - -Everyone interacting in the Sequel::Batches project’s codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/[USERNAME]/sequel-batches/blob/master/CODE_OF_CONDUCT.md). + +Supported by Umbrellio + diff --git a/Rakefile b/Rakefile index b7e9ed5..1563230 100644 --- a/Rakefile +++ b/Rakefile @@ -1,6 +1,10 @@ +# frozen_string_literal: true + require "bundler/gem_tasks" require "rspec/core/rake_task" +require "rubocop/rake_task" RSpec::Core::RakeTask.new(:spec) +RuboCop::RakeTask.new(:lint) -task :default => :spec +task default: %i[lint spec] diff --git a/lib/sequel/extensions/batches.rb b/lib/sequel/extensions/batches.rb index f44795a..3e9276e 100644 --- a/lib/sequel/extensions/batches.rb +++ b/lib/sequel/extensions/batches.rb @@ -1,3 +1,5 @@ +# frozen_string_literal: true + module Sequel module Extensions module Batches @@ -5,55 +7,13 @@ module Batches NullPKError = Class.new(StandardError) InvalidPKError = Class.new(StandardError) - def in_batches(pk: nil, of: 1000, start: nil, finish: nil) - pk ||= db.schema(first_source).select { |x| x[1][:primary_key] }.map(&:first) - raise MissingPKError if pk.empty? - - qualified_pk = pk.map { |x| Sequel[first_source][x] } - - check_pk = lambda do |input_pk| - raise InvalidPKError if input_pk.keys != pk - input_pk - end - - conditions = lambda do |pk, sign:| - raise NullPKError if pk.values.any?(&:nil?) - row_expr = Sequel.function(:row, *pk.values) - Sequel.function(:row, *qualified_pk).public_send(sign, row_expr) - end - - base_ds = order(*qualified_pk) - base_ds = base_ds.where(conditions.call(check_pk.call(start), sign: :>=)) if start - base_ds = base_ds.where(conditions.call(check_pk.call(finish), sign: :<=)) if finish - - pk_ds = db.from(base_ds).select(*pk).order(*pk) - actual_start = pk_ds.first - actual_finish = pk_ds.last - - return unless actual_start && actual_finish - - base_ds = base_ds.where(conditions.call(actual_start, sign: :>=)) - base_ds = base_ds.where(conditions.call(actual_finish, sign: :<=)) - - current_instance = nil - - loop do - if current_instance - working_ds = base_ds.where(conditions.call(current_instance.to_h, sign: :>)) - else - working_ds = base_ds - end - - current_instance = db.from(working_ds.limit(of)).select(*pk).order(*pk).last or break - working_ds = working_ds.where(conditions.call(current_instance.to_h, sign: :<=)) - - yield working_ds - end + def in_batches(**options, &block) + Sequel::Extensions::Batches::Yielder.new(ds: self, **options).call(&block) end - - private - - ::Sequel::Dataset.register_extension(:batches, Batches) end end end + +::Sequel::Dataset.register_extension(:batches, Sequel::Extensions::Batches) + +require_relative "batches/yielder" diff --git a/lib/sequel/extensions/batches/yielder.rb b/lib/sequel/extensions/batches/yielder.rb new file mode 100644 index 0000000..09eaf2d --- /dev/null +++ b/lib/sequel/extensions/batches/yielder.rb @@ -0,0 +1,82 @@ +# frozen_string_literal: true + +module Sequel::Extensions::Batches + class Yielder + attr_accessor :ds, :of, :start, :finish + attr_writer :pk + + def initialize(ds:, pk: nil, of: 1000, start: nil, finish: nil) + self.ds = ds + self.pk = pk + self.of = of + self.start = start + self.finish = finish + end + + def call + base_ds = setup_base_ds or return + + current_instance = nil + + loop do + working_ds = + if current_instance + base_ds.where(generate_conditions(current_instance.to_h, sign: :>)) + else + base_ds + end + + current_instance = db.from(working_ds.limit(of)).select(*pk).order(*pk).last or break + working_ds = working_ds.where(generate_conditions(current_instance.to_h, sign: :<=)) + + yield working_ds + end + end + + private + + def db + ds.db + end + + def pk + @pk ||= begin + pk = db.schema(ds.first_source).select { |x| x[1][:primary_key] }.map(&:first) + raise MissingPKError if pk.empty? + pk + end + end + + def qualified_pk + @qualified_pk ||= pk.map { |x| Sequel[ds.first_source][x] } + end + + def check_pk(input_pk) + raise InvalidPKError if input_pk.keys != pk + input_pk + end + + def generate_conditions(input_pk, sign:) + raise NullPKError if input_pk.values.any?(&:nil?) + row_expr = Sequel.function(:row, *input_pk.values) + Sequel.function(:row, *qualified_pk).public_send(sign, row_expr) + end + + def setup_base_ds + base_ds = ds.order(*qualified_pk) + base_ds = base_ds.where(generate_conditions(check_pk(start), sign: :>=)) if start + base_ds = base_ds.where(generate_conditions(check_pk(finish), sign: :<=)) if finish + + pk_ds = db.from(base_ds).select(*pk).order(*pk) + actual_start = pk_ds.first + actual_finish = pk_ds.last + + return unless actual_start && actual_finish + + base_ds = base_ds.where(generate_conditions(actual_start, sign: :>=)) + base_ds = base_ds.where(generate_conditions(actual_finish, sign: :<=)) + + base_ds + end + end +end diff --git a/sequel-batches.gemspec b/sequel-batches.gemspec index 190c443..7d00bcb 100644 --- a/sequel-batches.gemspec +++ b/sequel-batches.gemspec @@ -1,39 +1,29 @@ +# frozen_string_literal: true -lib = File.expand_path("../lib", __FILE__) +lib = File.expand_path("lib", __dir__) $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) Gem::Specification.new do |spec| - spec.name = "sequel-batches" - spec.version = "0.2.1" - spec.authors = ["fiscal-cliff", "umbrellio"] - spec.email = ["oss@umbrellio.biz"] + spec.name = "sequel-batches" + spec.version = "0.2.1" + spec.authors = ["fiscal-cliff", "umbrellio"] + spec.email = ["oss@umbrellio.biz"] - spec.summary = %q{The extension mimics AR5 batches api} - spec.description = %q{Allows you to split your dataset in batches} - spec.homepage = "https://github.com/umbrellio/sequel-batches" - spec.license = "MIT" + spec.summary = "The extension mimics AR5 batches api" + spec.description = "Allows you to split your dataset in batches" + spec.homepage = "https://github.com/umbrellio/sequel-batches" + spec.license = "MIT" - # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host' - # to allow pushing to a single host or delete this section to allow pushing to any host. - if spec.respond_to?(:metadata) - spec.metadata["allowed_push_host"] = "https://rubygems.org" - else - raise "RubyGems 2.0 or newer is required to protect against " \ - "public gem pushes." - end - - spec.files = `git ls-files -z`.split("\x0").reject do |f| - f.match(%r{^(test|spec|features)/}) - end - spec.bindir = "exe" - spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) } + spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) } spec.require_paths = ["lib"] + spec.add_runtime_dependency "sequel" + spec.add_development_dependency "bundler" spec.add_development_dependency "coveralls" spec.add_development_dependency "pry" spec.add_development_dependency "rake" spec.add_development_dependency "rspec" - - spec.add_runtime_dependency "sequel" + spec.add_development_dependency "rubocop-config-umbrellio" + spec.add_development_dependency "simplecov" end diff --git a/spec/sequel/extensions/batches_spec.rb b/spec/sequel/extensions/batches_spec.rb index d455f0b..7e4cc62 100644 --- a/spec/sequel/extensions/batches_spec.rb +++ b/spec/sequel/extensions/batches_spec.rb @@ -1,14 +1,16 @@ +# frozen_string_literal: true + RSpec.describe "Sequel::Extensions::Batches" do let(:chunks) { [] } it "splits 6 records in 2 chunks" do DB[:data].in_batches(of: 3) { |b| chunks << b.select_map(:id) } - expect(chunks).to eq([[1,2,3], [4,5,6]]) + expect(chunks).to eq([[1, 2, 3], [4, 5, 6]]) end it "splits 6 records in 3 chunks" do DB[:data].in_batches(of: 2) { |b| chunks << b.select_map(:id) } - expect(chunks).to eq([[1,2], [3,4], [5,6]]) + expect(chunks).to eq([[1, 2], [3, 4], [5, 6]]) end it "splits 6 records in 6 chunks" do @@ -17,34 +19,37 @@ end it "starts from 4" do - DB[:data].in_batches(of: 1, start: {id: 4}) { |b| chunks << b.select_map(:id) } + DB[:data].in_batches(of: 1, start: { id: 4 }) { |b| chunks << b.select_map(:id) } expect(chunks).to eq([[4], [5], [6]]) end it "ends on 3" do - DB[:data].in_batches(of: 1, finish: {id: 3}) { |b| chunks << b.select_map(:id) } + DB[:data].in_batches(of: 1, finish: { id: 3 }) { |b| chunks << b.select_map(:id) } expect(chunks).to eq([[1], [2], [3]]) end it "uses another column" do - DB[:data].in_batches(pk: [:created_at], start: { created_at: "2017-05-01" } ) { |b| chunks << b.select_map(:id) } + DB[:data].in_batches(pk: [:created_at], start: { created_at: "2017-05-01" }) do |b| + chunks << b.select_map(:id) + end + expect(chunks).to eq([[3, 4, 5, 6]]) end it "works correctly with composite pk" do - DB[:data].in_batches(pk: [:id, :value], of: 3) { |b| chunks << b.select_map(:id) } + DB[:data].in_batches(pk: %i[id value], of: 3) { |b| chunks << b.select_map(:id) } expect(chunks).to eq([[1, 2, 3], [4, 5, 6]]) end it "works correctly with composite pk on reversed pk" do - DB[:data].in_batches(pk: [:value, :id], of: 2) { |b| chunks << b.select_map(:id) } + DB[:data].in_batches(pk: %i[value id], of: 2) { |b| chunks << b.select_map(:id) } expect(chunks).to eq([[1, 2], [3, 4], [5, 6]]) end it "raises NullPKError in case of pk containing nulls" do DB[:data].where(id: [1, 2, 3]).update(value: nil, created_at: nil) - expect { DB[:data].in_batches(pk: [:id, :value, :created_at], of: 3) {} } + expect { DB[:data].in_batches(pk: %i[id value created_at], of: 3) {} } .to raise_error(Sequel::Extensions::Batches::NullPKError) end @@ -60,22 +65,22 @@ end it "does nothing if start is too high" do - DB[:data].in_batches(start: {id: 100}) { |b| chunks << b.select_map(:id) } + DB[:data].in_batches(start: { id: 100 }) { |b| chunks << b.select_map(:id) } expect(chunks).to eq([]) end it "works correctly with real composite pk" do - DB[:points].in_batches(pk: %i[x y z]) { |b| chunks << b.select_map([:x, :y, :z]) } + DB[:points].in_batches(pk: %i[x y z]) { |b| chunks << b.select_map(%i[x y z]) } expect(chunks).to eq([[[15, 15, 15], [15, 20, 20]]]) end it "works correctly with real composite pk and small of" do - DB[:points].in_batches(pk: %i[x y z], of: 1) { |b| chunks << b.select_map([:x, :y, :z]) } + DB[:points].in_batches(pk: %i[x y z], of: 1) { |b| chunks << b.select_map(%i[x y z]) } expect(chunks).to eq([[[15, 15, 15]], [[15, 20, 20]]]) end it "raises InvalidPKError in case of incorrect key ordering in start" do - expect { DB[:points].in_batches(pk: %i[x y z], start: {y: 16, z: 100, x: 15}) {} } + expect { DB[:points].in_batches(pk: %i[x y z], start: { y: 16, z: 100, x: 15 }) {} } .to raise_error(Sequel::Extensions::Batches::InvalidPKError) end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index da38995..8f2648f 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -1,3 +1,20 @@ +# frozen_string_literal: true + +def is_jruby? + RUBY_ENGINE == "jruby" +end + +require "simplecov" +require "coveralls" + +SimpleCov.formatter = SimpleCov::Formatter::MultiFormatter.new([ + SimpleCov::Formatter::HTMLFormatter, + Coveralls::SimpleCov::Formatter, +]) + +SimpleCov.minimum_coverage(100) unless is_jruby? +SimpleCov.start + RSpec.configure do |config| # Enable flags like --only-failures and --next-failure config.example_status_persistence_file_path = ".rspec_status" @@ -10,22 +27,18 @@ end end -require "coveralls" - -Coveralls.wear! - require "bundler/setup" require "sequel" require "logger" -DB_NAME = (ENV['DB_NAME'] || "batches_test").freeze +DB_NAME = (ENV["DB_NAME"] || "batches_test").freeze def connect - jruby = (defined?(RUBY_ENGINE) && RUBY_ENGINE == 'jruby') || defined?(JRUBY_VERSION) - schema = jruby ? "jdbc:postgresql" : "postgres" + schema = is_jruby? ? "jdbc:postgresql" : "postgres" Sequel.connect("#{schema}:///#{DB_NAME}").tap(&:tables) -rescue Sequel::DatabaseConnectionError => e - raise unless e.message.include? "database \"#{DB_NAME}\" does not exist" +rescue Sequel::DatabaseConnectionError => error + raise unless error.message.include? "database \"#{DB_NAME}\" does not exist" + `createdb #{DB_NAME}` Sequel.connect("#{schema}:///#{DB_NAME}") end @@ -47,7 +60,7 @@ def connect column :value, "int" end - DB[:data].multi_insert(YAML.load(IO.read("spec/fixtures/data.yml"))) + DB[:data].multi_insert(YAML.load_file("./spec/fixtures/data.yml")) DB.drop_table?(:points) DB.create_table?(:points) do @@ -56,7 +69,7 @@ def connect column :z, "int" end - DB[:points].multi_insert(YAML.load(IO.read("spec/fixtures/points.yml"))) + DB[:points].multi_insert(YAML.load_file("./spec/fixtures/points.yml")) end config.expect_with :rspec do |c|