From 14629595596ffffdb9c308f51a7418f3912185cc Mon Sep 17 00:00:00 2001 From: kspurgin Date: Tue, 31 Aug 2021 16:15:05 -0400 Subject: [PATCH] Job setup and testing; new transforms (#33) * make application configurable here, or from projects using it * handle special options for destinations * handle namespaced file registry keys * add FileRegistryEntry class to handle settings in one place and validate * use pry for console * implement dry-container as file registry and customize it to create/return FileRegistryEntry objects * separate Job from BaseJob; add TestingJob * add Deduplicate::Table and Extract::Fields transforms * add documentation and update tests to new format * add ExampleFormatter method to help create documentation --- .gitignore | 3 + .rubocop.yml | 4 + .yardopts | 3 +- Gemfile.lock | 28 +- bin/console | 8 +- doc/file_registry_entry.md | 101 +++++++ kiba-extend.gemspec | 5 +- lib/kiba/extend.rb | 44 ++- lib/kiba/extend/jobs.rb | 12 + lib/kiba/extend/jobs/base_job.rb | 130 +++++++++ lib/kiba/extend/jobs/dependency_job.rb | 48 ++++ lib/kiba/extend/jobs/job.rb | 48 ++++ lib/kiba/extend/jobs/job_segment.rb | 13 + lib/kiba/extend/jobs/parser.rb | 17 ++ lib/kiba/extend/jobs/reporter.rb | 126 ++++++++ lib/kiba/extend/jobs/runner.rb | 128 +++++++++ lib/kiba/extend/jobs/show_me_job.rb | 27 ++ lib/kiba/extend/jobs/tell_me_job.rb | 26 ++ lib/kiba/extend/jobs/testing_job.rb | 82 ++++++ lib/kiba/extend/registry/file_registry.rb | 81 ++++++ .../extend/registry/file_registry_entry.rb | 141 +++++++++ .../extend/registry/registered_destination.rb | 66 +++++ lib/kiba/extend/registry/registered_file.rb | 38 +++ lib/kiba/extend/registry/registered_lookup.rb | 47 +++ lib/kiba/extend/registry/registered_source.rb | 26 ++ .../registry/registry_entry_selector.rb | 46 +++ lib/kiba/extend/registry/registry_list.rb | 13 + .../extend/registry/registry_validator.rb | 67 +++++ lib/kiba/extend/registry/requirable_file.rb | 27 ++ .../extend/registry/source_dest_registry.rb | 63 
++++ lib/kiba/extend/sources/file_set.rb | 34 +++ lib/kiba/extend/transforms/deduplicate.rb | 264 ++++++++++++++++- lib/kiba/extend/transforms/extract.rb | 101 +++++++ lib/kiba/extend/transforms/helpers.rb | 10 +- lib/kiba/extend/transforms/split.rb | 10 +- lib/kiba/extend/utils/lookup.rb | 11 +- lib/kiba/extend/version.rb | 2 +- spec/fixtures/base_job_base.csv | 3 + spec/fixtures/base_job_lookup.csv | 3 + spec/fixtures/existing.csv | 3 + spec/helpers.rb | 172 ++++++++++- spec/kiba/extend/jobs/job_spec.rb | 71 +++++ spec/kiba/extend/jobs/testing_job_spec.rb | 35 +++ .../registry/file_registry_entry_spec.rb | 82 ++++++ .../extend/registry/file_registry_spec.rb | 82 ++++++ .../registry/registered_destination_spec.rb | 100 +++++++ .../extend/registry/registered_file_spec.rb | 35 +++ .../extend/registry/registered_lookup_spec.rb | 45 +++ .../extend/registry/registered_source_spec.rb | 54 ++++ .../registry/registry_entry_selector_spec.rb | 73 +++++ .../registry/registry_validator_spec.rb | 20 ++ .../extend/registry/requirable_file_spec.rb | 46 +++ spec/kiba/extend/sources/file_set_spec.rb | 66 +++++ spec/kiba/extend/transforms/append_spec.rb | 1 - spec/kiba/extend/transforms/clean_spec.rb | 8 +- .../extend/transforms/deduplicate_spec.rb | 270 +++++++++++------- spec/kiba/extend/transforms/extract_spec.rb | 57 ++++ spec/kiba/extend/transforms/helpers_spec.rb | 18 +- spec/kiba/extend/transforms/merge_spec.rb | 1 - spec/kiba/extend/transforms/prepend_spec.rb | 8 +- spec/kiba/extend/transforms/split_spec.rb | 54 ++-- spec/kiba/extend/transforms/take_spec.rb | 17 +- spec/kiba/extend/utils/lookup_spec.rb | 15 - 63 files changed, 3041 insertions(+), 198 deletions(-) create mode 100644 doc/file_registry_entry.md create mode 100644 lib/kiba/extend/jobs.rb create mode 100644 lib/kiba/extend/jobs/base_job.rb create mode 100644 lib/kiba/extend/jobs/dependency_job.rb create mode 100644 lib/kiba/extend/jobs/job.rb create mode 100644 lib/kiba/extend/jobs/job_segment.rb create mode 
100644 lib/kiba/extend/jobs/parser.rb create mode 100644 lib/kiba/extend/jobs/reporter.rb create mode 100644 lib/kiba/extend/jobs/runner.rb create mode 100644 lib/kiba/extend/jobs/show_me_job.rb create mode 100644 lib/kiba/extend/jobs/tell_me_job.rb create mode 100644 lib/kiba/extend/jobs/testing_job.rb create mode 100644 lib/kiba/extend/registry/file_registry.rb create mode 100644 lib/kiba/extend/registry/file_registry_entry.rb create mode 100644 lib/kiba/extend/registry/registered_destination.rb create mode 100644 lib/kiba/extend/registry/registered_file.rb create mode 100644 lib/kiba/extend/registry/registered_lookup.rb create mode 100644 lib/kiba/extend/registry/registered_source.rb create mode 100644 lib/kiba/extend/registry/registry_entry_selector.rb create mode 100644 lib/kiba/extend/registry/registry_list.rb create mode 100644 lib/kiba/extend/registry/registry_validator.rb create mode 100644 lib/kiba/extend/registry/requirable_file.rb create mode 100644 lib/kiba/extend/registry/source_dest_registry.rb create mode 100644 lib/kiba/extend/sources/file_set.rb create mode 100644 lib/kiba/extend/transforms/extract.rb create mode 100644 spec/fixtures/base_job_base.csv create mode 100644 spec/fixtures/base_job_lookup.csv create mode 100644 spec/fixtures/existing.csv create mode 100644 spec/kiba/extend/jobs/job_spec.rb create mode 100644 spec/kiba/extend/jobs/testing_job_spec.rb create mode 100644 spec/kiba/extend/registry/file_registry_entry_spec.rb create mode 100644 spec/kiba/extend/registry/file_registry_spec.rb create mode 100644 spec/kiba/extend/registry/registered_destination_spec.rb create mode 100644 spec/kiba/extend/registry/registered_file_spec.rb create mode 100644 spec/kiba/extend/registry/registered_lookup_spec.rb create mode 100644 spec/kiba/extend/registry/registered_source_spec.rb create mode 100644 spec/kiba/extend/registry/registry_entry_selector_spec.rb create mode 100644 spec/kiba/extend/registry/registry_validator_spec.rb create mode 100644 
spec/kiba/extend/registry/requirable_file_spec.rb create mode 100644 spec/kiba/extend/sources/file_set_spec.rb create mode 100644 spec/kiba/extend/transforms/extract_spec.rb diff --git a/.gitignore b/.gitignore index 802a9756a..e79de1a20 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,6 @@ # rspec failure tracking .rspec_status +.byebug_history + +**/.~lock* \ No newline at end of file diff --git a/.rubocop.yml b/.rubocop.yml index d219daf7b..c0d957875 100644 --- a/.rubocop.yml +++ b/.rubocop.yml @@ -9,3 +9,7 @@ Metrics/BlockLength: - spec/kiba/extend/**/* Naming/MethodParameterName: AllowedNames: i, v +Style/ModuleFunction: + Exclude: + # alias_method doesn't work if extend self is changed to module_function + - lib/kiba/extend/utils/lookup.rb diff --git a/.yardopts b/.yardopts index c7cb7f9f2..76c3f6a69 100644 --- a/.yardopts +++ b/.yardopts @@ -5,4 +5,5 @@ --no-private --markup markdown - -LICENSE.txt \ No newline at end of file +LICENSE.txt +doc/file_registry_entry.md \ No newline at end of file diff --git a/Gemfile.lock b/Gemfile.lock index 8a5fb7cb6..b0e82b010 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,9 +1,11 @@ PATH remote: . 
specs: - kiba-extend (2.1.1) + kiba-extend (2.2.0) activesupport (~> 6.1.4) csv (~> 3.0) + dry-configurable (~> 0.11) + dry-container (~> 0.8) kiba (~> 4.0.0) kiba-common (~> 1.5.0) xxhash (~> 0.4) @@ -11,30 +13,39 @@ PATH GEM remote: https://rubygems.org/ specs: - activesupport (6.1.4) + activesupport (6.1.4.1) concurrent-ruby (~> 1.0, >= 1.0.2) i18n (>= 1.6, < 2) minitest (>= 5.1) tzinfo (~> 2.0) zeitwerk (~> 2.3) ast (2.4.2) - coderay (1.1.2) + byebug (11.1.3) + coderay (1.1.3) concurrent-ruby (1.1.9) csv (3.2.0) diff-lcs (1.3) + dry-configurable (0.12.1) + concurrent-ruby (~> 1.0) + dry-core (~> 0.5, >= 0.5.0) + dry-container (0.8.0) + concurrent-ruby (~> 1.0) + dry-configurable (~> 0.1, >= 0.1.3) + dry-core (0.7.1) + concurrent-ruby (~> 1.0) i18n (1.8.10) concurrent-ruby (~> 1.0) kiba (4.0.0) kiba-common (1.5.0) kiba (>= 3.0.0, < 5) - method_source (0.9.2) + method_source (1.0.0) minitest (5.14.4) parallel (1.20.1) parser (3.0.2.0) ast (~> 2.4.1) - pry (0.12.2) - coderay (~> 1.1.0) - method_source (~> 0.9.0) + pry (0.14.1) + coderay (~> 1.1) + method_source (~> 1.0) rainbow (3.0.0) rake (13.0.1) regexp_parser (2.1.1) @@ -78,8 +89,9 @@ PLATFORMS DEPENDENCIES bundler (>= 1.17) + byebug (~> 11.0) kiba-extend! - pry (~> 0.12.2) + pry (~> 0.14) rake (~> 13.0) rspec (~> 3.0) rubocop (~> 1.18.4) diff --git a/bin/console b/bin/console index 382ce4184..2551fd4ee 100755 --- a/bin/console +++ b/bin/console @@ -7,9 +7,5 @@ require 'kiba/extend' # You can add fixtures and/or initialization code here to make experimenting # with your gem easier. You can also use a different console, if you like. -# (If you use this, don't forget to add pry to your Gemfile!) 
-# require "pry" -# Pry.start - -require 'irb' -IRB.start(__FILE__) +require 'pry' +Pry.start diff --git a/doc/file_registry_entry.md b/doc/file_registry_entry.md new file mode 100644 index 000000000..efef06ad4 --- /dev/null +++ b/doc/file_registry_entry.md @@ -0,0 +1,101 @@ +# File Registry Entry + +## PATH_REQ + +Constant registering the known source/destination classes and whether each requires a file path for read/write. + +If you create or incorporate a new source/destination class, you will get a warning if you use it and do not register it here. + +## reghash + +A file registry entry is initialized with a Hash of data about the file. This Hash will be sent from your ETL application. + +The allowable Hash keys, expected Hash value formats, and expectations about them are described below. + +**`:path` [String] full or expandable relative path to the expected location of the file** + +* default: `nil` +* required if either `:src_class` or `:dest_class` requires a path (in `PATH_REQ`) + +`:src_class` [Class] the Ruby class used to read in data + +* default: value of `Kiba::Extend.source` (`Kiba::Common::Sources::CSV` unless overridden by your ETL app) +* required, but default supplied if not given + +`:src_opt` [Hash] file options used when reading in source + +* default: value of `Kiba::Extend.csvopts` +* required, but default supplied if not given + +`:dest_class` [Class] the Ruby class used to write out the data + +* default: value of `Kiba::Extend.destination` (`Kiba::Extend::Destinations::CSV` unless overridden by your ETL app) +* required, but default supplied if not given + +`:dest_opt` [Hash] file options used when writing data + +* default: value of `Kiba::Extend.csvopts` +* required, but default supplied if not given + +`:dest_special_opts` [Hash] additional options for writing out the data + +* Not all destination classes support extra options. 
If you provide unsupported extra options, they will not be sent through to the destination class, and you will receive a warning in STDOUT. The current most common use is to define `initial_headers` (i.e. which columns should be first in file) to `Kiba::Extend::Destinations::CSV`. +* optional + +```ruby +reghash = { + path: '/path/to/file.csv', + dest_class: Kiba::Extend::Destinations::CSV, + dest_special_opts: { initial_headers: %i[objectnumber briefdescription] } + } +``` + +**`:creator` [Method] Ruby method that generates this file** + +* Used to run ETL jobs to create necessary files, if said files do not exist +* required unless file is supplied + +**`:supplied` [true, false] whether the file/data is supplied from outside the ETL** + +- default: false +- Manually set to true for: + - original data files from client + - mappings/reconciliations to be merged into the ETL/migration + - any other files created external to the ETL, which only need to be read from and never generated by the ETL process + +Both of the following are valid: + +```ruby +reghash = { + path: '/project/working/objects_prep.csv', + creator: Project::ClientData::ObjectTable.method(:prep) +} + +reghash = { + path: '/project/clientData/objects.csv', + supplied: true +} +``` + +Note the following pattern!: + + Class or Module constant name + `.method` + method name **as symbol** + +**`:lookup_on` [Symbol] column to use as keys in lookup table created from file data** + +* required if file is used as a lookup source +* You can register the same file multiple times under different file keys with different `:lookup_on` values if you need to use the data for different lookup purposes + +`:desc` [String] description of what the file is/what it is used for. 
Used when post-processing reports results to STDOUT + +* optional + +`:tags` [Array] list of arbitrary tags useful for categorizing data/jobs in your ETL + +* optional +* If set, you can filter to run only jobs tagged with a given tag +* Tags I commonly use: + * :report_problems - reports that indicate something unexpected or that I need to do more work + * :report_fyi - informational reports + * :cspace - final files ready to import + diff --git a/kiba-extend.gemspec b/kiba-extend.gemspec index d9e6bbc38..cd1756d3a 100644 --- a/kiba-extend.gemspec +++ b/kiba-extend.gemspec @@ -37,12 +37,15 @@ Gem::Specification.new do |spec| spec.add_dependency 'activesupport', '~> 6.1.4' spec.add_dependency 'csv', '~> 3.0' + spec.add_dependency 'dry-configurable', '~> 0.11' + spec.add_dependency 'dry-container', '~> 0.8' spec.add_dependency 'kiba', '~> 4.0.0' spec.add_dependency 'kiba-common', '~> 1.5.0' spec.add_dependency 'xxhash', '~> 0.4' spec.add_development_dependency 'bundler', '>= 1.17' - spec.add_development_dependency 'pry', '~> 0.12.2' + spec.add_development_dependency 'byebug', '~>11.0' + spec.add_development_dependency 'pry', '~> 0.14' spec.add_development_dependency 'rake', '~> 13.0' spec.add_development_dependency 'rspec', '~> 3.0' spec.add_development_dependency 'rubocop', '~> 1.18.4' diff --git a/lib/kiba/extend.rb b/lib/kiba/extend.rb index 7c032308f..2cd42d824 100644 --- a/lib/kiba/extend.rb +++ b/lib/kiba/extend.rb @@ -2,13 +2,17 @@ require 'active_support' require 'active_support/core_ext/object' +require 'dry-configurable' require 'kiba' require 'kiba-common/sources/csv' +require 'kiba-common/sources/enumerable' require 'kiba-common/destinations/csv' +require 'kiba-common/destinations/lambda' require 'pry' +require 'byebug' require 'xxhash' -require 'kiba/extend/version' +# require 'kiba/extend/version' # Default CSV options CSVOPT = { headers: true, header_converters: :symbol }.freeze @@ -24,13 +28,49 @@ module Kiba # Provides a suite of abstract, reusable, 
well-tested data transformations for use in Kiba ETL pipelines module Extend - puts "kiba-extend version: #{Kiba::Extend::VERSION}" + module_function + extend Dry::Configurable # Require application files Dir.glob("#{__dir__}/**/*").sort.select { |path| path.match?(/\.rb$/) }.each do |rbfile| require rbfile.delete_prefix("#{File.expand_path(__dir__)}/lib/") end + # So we can call Kiba.job_segment + Kiba.extend(Kiba::Extend::Jobs::JobSegmenter) + + # Default options for reading/writing CSVs + setting :csvopts, { headers: true, header_converters: %i[symbol downcase] }, reader: true + + # Default settings for Lambda destination + setting :lambdaopts, { on_write: ->(r) { accumulator << r } }, reader: true + + # Default delimiter for splitting/joining values in multi-valued fields + setting :delim, ';', reader: true + + # Default source class for jobs + setting :source, Kiba::Common::Sources::CSV, reader: true + + # Default destination class for jobs + setting :destination, Kiba::Extend::Destinations::CSV, reader: true + + # Prefix for warnings from the ETL + setting :warning_label, 'KIBA WARNING', reader: true + + setting :registry, Kiba::Extend::FileRegistry.new, reader: true + + setting :job, reader: true do + # Whether to output results to STDOUT for debugging + setting :show_me, false, reader: true + # Whether to have computer say something when job is complete + setting :tell_me, false, reader: true + # How much output about jobs to output to STDOUT + # :debug - tells you A LOT - helpful when developing pipelines and debugging + # :normal - reports what is running, from where, and the results + # :minimal - bare minimum + setting :verbosity, :normal, reader: true + end + # strips, collapses multiple spaces, removes terminal commas, strips again CSV::Converters[:stripplus] = lambda { |s| begin diff --git a/lib/kiba/extend/jobs.rb b/lib/kiba/extend/jobs.rb new file mode 100644 index 000000000..604159081 --- /dev/null +++ b/lib/kiba/extend/jobs.rb @@ -0,0 +1,12 @@ +# 
frozen_string_literal: true + +require_relative 'jobs/parser' + +Kiba::Extend::Jobs.extend(Kiba::Extend::Jobs::Parser) + +module Kiba + module Extend + module Jobs + end + end +end diff --git a/lib/kiba/extend/jobs/base_job.rb b/lib/kiba/extend/jobs/base_job.rb new file mode 100644 index 000000000..3c9353f39 --- /dev/null +++ b/lib/kiba/extend/jobs/base_job.rb @@ -0,0 +1,130 @@ +# frozen_string_literal: true + +require_relative 'runner' +require_relative 'parser' +require_relative 'show_me_job' +require_relative 'tell_me_job' +require_relative 'dependency_job' + +module Kiba + module Extend + # Reusable, composable patterns for jobs + # + # Heretofore, I have been repeating tons of code/logic for setting up a job in migration code: + # + # - Defining sources/destinations, @srcrows, @outrows + # - Changing CSV rows to hashes (initial transforms) + # - Changing hashes back to CSV rows + # - Calling postprocessing + # + # Most of this never changes, and when it does there is way too much tedious work in a given migration + # to make it consistent across all jobs. + # + # This is an attempt to dry up calling jobs and make it possible to test them automatically with stubbed-in + # enumerable sources/destinations + # + # Running `Kiba.parse` to define a job generates a {https://github.com/thbar/kiba/blob/master/lib/kiba/control.rb Kiba::Control} + # object, which is a wrapper bundling together: pre_processes, config, sources, transforms, destinations, and + # post_processes. + # + # As described {https://github.com/thbar/kiba/wiki/Implementing-pre-and-post-processors here}, pre_ and post_ + # processors get called once per ETL run---either before or after the ETL starts working through the source + # rows + # + # What Kiba::Extend::Jobs add is the ability to set up reusable initial_transformers and final_transformers. + # Basically, job templates where just the meat of the transformations change. 
+ # + # `files` is the configuration of destination, source, and lookup files the job will use. It is a Hash, with + # the following format: + # + # { source: [registry_key, registry_key], destination: [registry_key], lookup: [registry_key] } + # + # { source: [registry_key, registry_key], destination: [registry_key]} + # + # `source` and `destination` must each have at least one registry key. `lookup` may be omitted, or it may + # be included with one or more registry keys + # + # `transformer` is a sequence of data transformations that could theoretically be called with interchangable + # input/output settings (i.e. `materials`). In practice, a `recipe` is usually closely tied to particular tables, because + # fields are manipulated by name. However, this should support easier automated testing of `recipes`. + # + # @since 2.2.0 + module Jobs + # Abstract definition of Job and job interface + # + # @abstract + # @return [Kiba::Control] + class BaseJob + include Runner + include Parser + + attr_reader :control, :context, :files, :transformer + + # @param files [Hash] + # @param transformer [Kiba::Control] + # @param show [Boolean] + def initialize(files:, transformer:) + @dependency = true if caller(2, 5).join(' ')['block in handle_requirements'] + extend DependencyJob if @dependency + + @files = setup_files(files) + report_run_start # defined in Reporter + @control = Kiba::Control.new + @context = Kiba::Context.new(control) + @transformer = transformer + handle_requirements # defined in Runner + assemble_control # defined in Runner + run + report_run_end # defined in Reporter + end + + def run + Kiba.run(control) + end + + private + + def job_data + @files[:destination].first.data + end + + # Replace file key names with registered_source/lookup/destination objects dynamically + def setup_files(files) + tmp = {} + files.each do |type, arr| + method = Kiba::Extend.registry.method("as_#{type}") + tmp[type] = [arr].flatten.map { |key| method.call(key) } + end + tmp + 
end + + def initial_transforms + Kiba.job_segment do + end + end + + def final_transforms + Kiba.job_segment do + end + end + + def pre_process + Kiba.job_segment do + end + end + + def config + Kiba.parse do + end.config + end + + def post_process + Kiba.job_segment do + post_process do + end + end + end + end + end + end +end diff --git a/lib/kiba/extend/jobs/dependency_job.rb b/lib/kiba/extend/jobs/dependency_job.rb new file mode 100644 index 000000000..37d9b657f --- /dev/null +++ b/lib/kiba/extend/jobs/dependency_job.rb @@ -0,0 +1,48 @@ +# frozen_string_literal: true + +module Kiba + module Extend + module Jobs + # Mixin for different behavior for dependency jobs + module DependencyJob + extend Kiba::Extend::Jobs::Reporter + + # overrides Runner + def add_decoration + # Don't decorate dependency jobs + end + + # the rest overrides Reporter + def verbose_start + puts start_and_def + puts " #{desc_and_tags}" + end + + def normal_start + puts start_and_def + end + + def minimal_start + # silent for dependency jobs + end + + def verbose_end + puts " #{row_report} written to #{job_data.path}" + puts " NOTE: #{job_data.message.upcase}" if job_data.message + end + + def normal_end + # silent for dependency jobs + end + + def minimal_end + # silent for dependency jobs + end + + def start_label + '->Starting dependency job' + end + end + end + end +end diff --git a/lib/kiba/extend/jobs/job.rb b/lib/kiba/extend/jobs/job.rb new file mode 100644 index 000000000..6c3192449 --- /dev/null +++ b/lib/kiba/extend/jobs/job.rb @@ -0,0 +1,48 @@ +# frozen_string_literal: true + +require_relative 'base_job' + +module Kiba + module Extend + module Jobs + # Job with one source, one destination, and zero-to-n lookups + class Job < BaseJob + private + + def initial_transforms + Kiba.job_segment do + transform { |r| r.to_h } + transform { |r| @srcrows += 1; r } + end + end + + def final_transforms + Kiba.job_segment do + transform { |r| @outrows += 1; r } + end + end + + def pre_process + 
Kiba.job_segment do + pre_process do + @srcrows = 0 + @outrows = 0 + end + end + end + + def config + Kiba.parse do + end.config + end + + def post_process + Kiba.job_segment do + post_process do + end + end + end + end + end + end +end diff --git a/lib/kiba/extend/jobs/job_segment.rb b/lib/kiba/extend/jobs/job_segment.rb new file mode 100644 index 000000000..20be148f7 --- /dev/null +++ b/lib/kiba/extend/jobs/job_segment.rb @@ -0,0 +1,13 @@ +# frozen_string_literal: true + +module Kiba + module Extend + module Jobs + module JobSegmenter + def job_segment(&source_as_block) + source_as_block + end + end + end + end +end diff --git a/lib/kiba/extend/jobs/parser.rb b/lib/kiba/extend/jobs/parser.rb new file mode 100644 index 000000000..2dbfc9d18 --- /dev/null +++ b/lib/kiba/extend/jobs/parser.rb @@ -0,0 +1,17 @@ +# frozen_string_literal: true + +module Kiba + module Extend + module Jobs + module Parser + def parse_job(control, context, *job_segments) + job_segments = job_segments.flatten + job_segments.compact.each { |segment| + context.instance_eval(&segment) + } + control + end + end + end + end +end diff --git a/lib/kiba/extend/jobs/reporter.rb b/lib/kiba/extend/jobs/reporter.rb new file mode 100644 index 000000000..f7ef25101 --- /dev/null +++ b/lib/kiba/extend/jobs/reporter.rb @@ -0,0 +1,126 @@ +# frozen_string_literal: true + +module Kiba + module Extend + module Jobs + # Mixin methods for reporting + module Reporter + def report_run_start + case Kiba::Extend.job.verbosity + when :verbose + verbose_start + return + when :minimal + minimal_start + return + else + normal_start + end + end + + def report_run_end + case Kiba::Extend.job.verbosity + when :verbose + verbose_end + return + when :minimal + minimal_end + return + else + normal_end + end + end + + def verbose_start + puts "\n-=-=-=-=-=-=-=-=-=-=-=-" + puts start_and_def + puts desc_and_tags + puts '' + put_file_details + end + + def normal_start + puts "\n-=-=-=-=-=-=-=-=-=-=-=-" + puts start_and_def + puts 
desc_and_tags + puts '' + end + + def minimal_start + puts "\n-=-=-=-=-=-=-=-=-=-=-=-" + puts start_and_def + end + + def verbose_end + puts "\n#{job_data.key} complete" + puts "#{row_report} written to #{job_data.path}" + puts "NOTE: #{job_data.message.upcase}" if job_data.message + puts '-=-=-=-=-=-=-=-=-=-=-=-' + puts '' + end + + def normal_end + puts "\n#{row_report} written to #{job_data.path}" + puts "NOTE: #{job_data.message.upcase}" if job_data.message + puts '-=-=-=-=-=-=-=-=-=-=-=-' + puts '' + end + + # silent + def minimal_end + puts row_report + puts '-=-=-=-=-=-=-=-=-=-=-=-' + puts '' + end + + def start_label + '->Starting dependency job' + end + + def creator_method_to_s + job_data.creator.to_s + .delete_prefix('#(r) { accumulator << r }}) + end + + def initial_transforms + nil + end + + def final_transforms + nil + end + + def pre_process + nil + end + + def config + Kiba.parse do + end.config + end + + def post_process + nil + end + + def handle_requirements + # no requirements + end + + def report_run_start + # no reporting + end + + def report_run_end + # no reporting + end + + def file_config(config) + [{klass: config.klass, args: [config.args]}] + end + + def sources + file_config(@files[:source]) + end + + def destinations + file_config(@files[:destination]) + end + end + end + end +end diff --git a/lib/kiba/extend/registry/file_registry.rb b/lib/kiba/extend/registry/file_registry.rb new file mode 100644 index 000000000..5f77ce5ab --- /dev/null +++ b/lib/kiba/extend/registry/file_registry.rb @@ -0,0 +1,81 @@ +# frozen_string_literal: true + +require 'dry-container' + +require_relative 'registered_source' +require_relative 'registered_lookup' +require_relative 'registered_destination' +require_relative 'file_registry_entry' +require_relative 'registry_validator' + +module Kiba + module Extend + # Transforms a file_registry hash into an object that can return source, lookup, or destination + # config for that file, for passing to jobs + # + # 
An example of a file registry setup in a project can be found at: + # https://github.com/lyrasis/fwm-cspace-migration/blob/main/lib/fwm/registry_data.rb + class FileRegistry + include Dry::Container::Mixin + + config.namespace_separator = '__' + + # Exception raised if the file key is not registered + class KeyNotRegisteredError < StandardError + # @param filekey [Symbol] + def initialize(filekey) + msg = "No file registered under the key: :#{filekey}" + super(msg) + end + end + + def as_destination(filekey) + Kiba::Extend::RegisteredDestination.new(key: filekey, data: lookup(filekey)) + end + + def as_lookup(filekey) + Kiba::Extend::RegisteredLookup.new(key: filekey, data: lookup(filekey)) + end + + def as_source(filekey) + Kiba::Extend::RegisteredSource.new(key: filekey, data: lookup(filekey)) + end + + def entries + @entries ||= populate_entries + end + + def transform + each { |key, val| decorate(key) { FileRegistryEntry.new(val) } } + @entries = populate_entries + each { |key, val| val.set_key(key) } + end + + def valid? + validator.valid? + end + + def warnings? + validator.warnings? + end + + private + + def lookup(key) + resolve(key) + rescue Dry::Container::Error + raise KeyNotRegisteredError, key + end + + def populate_entries + arr = [] + each { |entry| arr << entry[1] } + arr + end + + def validator + @validator ||= RegistryValidator.new + end + end + end +end diff --git a/lib/kiba/extend/registry/file_registry_entry.rb b/lib/kiba/extend/registry/file_registry_entry.rb new file mode 100644 index 000000000..d413a302a --- /dev/null +++ b/lib/kiba/extend/registry/file_registry_entry.rb @@ -0,0 +1,141 @@ +# frozen_string_literal: true + +require_relative 'source_dest_registry' + +module Kiba + module Extend + # Value object capturing the data about an entry in the file registry + # + # This is the underlying data that can be used to derive a registered source, + # destination, or lookup file object. 
+ # + # Used instead of just passing around a Hash so that it can validate itself and + # carry its own errors/warnings + class FileRegistryEntry + include SourceDestRegistry + + attr_reader :path, :key, + :creator, :supplied, :dest_special_opts, :desc, :lookup_on, :tags, :message, + :dest_class, :dest_opt, :src_class, :src_opt, :type, + :valid, :errors, :warnings + + # allowed types + TYPES = :file, :fileset, :enum, :lambda + + # @param reghash [Hash] File data. See {file:doc/file_registry_entry.md} for details + def initialize(reghash) + set_defaults + assign_values_from(reghash) + validate + end + + def set_key(key) + @key = key + end + + def summary + lines = ["#{key} -- #{tags.join(', ')}"] + lines << " #{path}" if path + lines << " #{desc}" if desc + lines << " #{creator}" if creator + lines.join("\n") + end + + def valid? + valid + end + + private + + def allowed_settings + instance_variables + .map(&:to_s) + .map { |str| str.delete_prefix('@') } + .map(&:to_sym) + end + + def allowed_setting?(key) + allowed_settings.any?(key) + end + + def assign_value(key, val) + if allowed_setting?(key) + instance_variable_set("@#{key}".to_sym, val) + else + @warnings << ":#{key} is not an allowed FileRegistryEntry setting" + end + end + + def assign_values_from(reghash) + reghash.each { |key, val| assign_value(key, val) } + end + + def path_required? + chk = [dest_class, src_class].map { |klass| requires_path?(klass) } + return false if chk.uniq == [false] + + true + end + + def set_defaults + @type = :file + @creator = nil + @desc = '' + @dest_class = Kiba::Extend.destination + @dest_opt = Kiba::Extend.csvopts + @dest_special_opts = nil + @lookup_on = nil + @path = nil + @src_class = Kiba::Extend.source + @src_opt = Kiba::Extend.csvopts + @supplied = false + @tags = [] + @valid = false + @errors = {} + @warnings = [] + end + + def validate + validate_path + validate_creator + validate_type + @valid = true if errors.empty? 
+ end + + def validate_creator + return if supplied + + validate_creator_present + validate_creator_is_method + end + + def validate_creator_is_method + return if creator.is_a?(Method) + + @errors[:creator_not_a_method] = creator.dup + @creator = nil + end + + def validate_creator_present + return if creator + + @errors[:missing_creator_for_non_supplied_file] = nil + end + + def validate_path + if path_required? && !path + @errors[:missing_path] = nil + return + end + + @path = Pathname.new(path) if path + end + + def validate_type + return if TYPES.any?(@type) + + @errors[:unknown_type] = @type + end + end + end +end diff --git a/lib/kiba/extend/registry/registered_destination.rb b/lib/kiba/extend/registry/registered_destination.rb new file mode 100644 index 000000000..174e58645 --- /dev/null +++ b/lib/kiba/extend/registry/registered_destination.rb @@ -0,0 +1,66 @@ +# frozen_string_literal: true + +require_relative 'registered_file' + +module Kiba + module Extend + # Value object representing a destination file registered in a {Kiba::Extend::FileRegistry} + class RegisteredDestination < RegisteredFile + # Arguments for calling Kiba Destination class + def args + return [simple_args] unless @data.dest_special_opts + + opts = supported_special_opts + warn_about_opts if opts.length < @data.dest_special_opts.length + return [simple_args] if opts.empty? 
+ + [simple_args.merge(supported_special_opts)] + end + + # Description of file + # + # Used in post-processing STDOUT + def description + @data.desc + end + + # Info hash for file + # + # @deprecated Use {#description} and {#key} instead + def info + { filekey: @key, desc: description } + end + + # Kiba Destination class to call + def klass + @data.dest_class + end + + private + + def klass_opts + klass.instance_method(:initialize).parameters.map { |arr| arr[1] } + end + + def simple_args + return { filename: path }.merge(options_label(klass) => @data.dest_opt) if @data.dest_opt + + { filename: path }.merge(labeled_options(klass)) + end + + def supported_special_opts + @data.dest_special_opts.select { |key, _| klass_opts.any?(key) } + end + + def unsupported_special_opts + @data.dest_special_opts.reject { |key, _| klass_opts.any?(key) } + end + + def warn_about_opts + unsupported_special_opts.each do |opt, _| + puts "WARNING: Destination file :#{key} is called with special option :#{opt}, which is unsupported by #{klass}" + end + end + end + end +end diff --git a/lib/kiba/extend/registry/registered_file.rb b/lib/kiba/extend/registry/registered_file.rb new file mode 100644 index 000000000..030d1a9cd --- /dev/null +++ b/lib/kiba/extend/registry/registered_file.rb @@ -0,0 +1,38 @@ +# frozen_string_literal: true + +require_relative 'source_dest_registry' + +module Kiba + module Extend + # Abstract base class defining interface for destination files, lookup files, and source files + # returned by {Kiba::Extend::FileRegistry} + class RegisteredFile + include SourceDestRegistry + # Exception raised if no path is given in {FileRegistry} hash + class NoFilePathError < StandardError + # @param filekey [Symbol] key for which a file path was not found in {Kiba::Extend::FileRegistry} + def initialize(filekey) + msg = "No file path for :#{filekey} is recorded in file registry hash" + super(msg) + end + end + + # @!attribute [r] key + # @return [Symbol] The file's key in 
{FileRegistry} hash + attr_reader :key, :data + + # @param key [Symbol] the {Kiba::Extend::FileRegistry} lookup key + # @param data [Hash] the hash of data for the file from {Kiba::Extend::FileRegistry} + def initialize(key:, data:) + raise FileNotRegisteredError, key unless data + raise NoFilePathError, key if data.errors.keys.any?(:missing_path) + + @key, @data = key, data + end + + def path + @data.path.to_s + end + end + end +end diff --git a/lib/kiba/extend/registry/registered_lookup.rb b/lib/kiba/extend/registry/registered_lookup.rb new file mode 100644 index 000000000..b8766044c --- /dev/null +++ b/lib/kiba/extend/registry/registered_lookup.rb @@ -0,0 +1,47 @@ +# frozen_string_literal: true + +require_relative 'registered_file' +require_relative 'requirable_file' + +module Kiba + module Extend + # Value object representing a file registered in a {Kiba::Extend::FileRegistry} that is being + # called into another job as a lookup table + # + # Assumes this file will be used to build a {Kiba::Extend::Lookup} + class RegisteredLookup < RegisteredFile + include RequirableFile + # Exception raised if {Kiba::Extend::FileRegistry} contains no lookup key for file + class NoLookupKeyError < StandardError + # @param filekey [Symbol] key not found in {Kiba::Extend::FileRegistry} + def initialize(filekey) + msg = "No lookup key column found for :#{filekey} in file registry hash" + super(msg) + end + end + + class NonSymbolLookupKeyError < StandardError + # @param filekey [Symbol] key not found in {Kiba::Extend::FileRegistry} + def initialize(filekey) + msg = "Lookup key found for :#{filekey} is not a Ruby Symbol. Prepend a : to the field name to fix." 
+ super(msg) + end + end + + # @param key [Symbol] file key from {FileRegistry} data hash + # @param data [Hash] file data from {FileRegistry} + def initialize(key:, data:) + super + raise NoLookupKeyError, @key unless @data.lookup_on + raise NonSymbolLookupKeyError, @key unless @data.lookup_on.is_a?(Symbol) + end + + # Arguments for calling {Kiba::Extend::Lookup} with this file + # @return [Hash] + def args + opts = @data.src_opt ? @data.src_opt : file_options(@data.src_class) + { file: path, csvopt: opts, keycolumn: @data.lookup_on } + end + end + end +end diff --git a/lib/kiba/extend/registry/registered_source.rb b/lib/kiba/extend/registry/registered_source.rb new file mode 100644 index 000000000..5acb21a20 --- /dev/null +++ b/lib/kiba/extend/registry/registered_source.rb @@ -0,0 +1,26 @@ +# frozen_string_literal: true + +require_relative 'registered_file' +require_relative 'requirable_file' + +module Kiba + module Extend + # Value object representing a file registered in a {Kiba::Extend::FileRegistry} that is being + # called into another job as a source table + class RegisteredSource < RegisteredFile + include RequirableFile + + # Arguments for calling Kiba Source class + # @return [Array(Hash)] + def args + opts = @data.src_opt ?
{ options_label(klass) => @data.src_opt } : labeled_options(klass) + [{ filename: path }.merge(opts)] + end + + # Kiba Source class to call + def klass + @data.src_class + end + end + end +end diff --git a/lib/kiba/extend/registry/registry_entry_selector.rb b/lib/kiba/extend/registry/registry_entry_selector.rb new file mode 100644 index 000000000..76ea64d42 --- /dev/null +++ b/lib/kiba/extend/registry/registry_entry_selector.rb @@ -0,0 +1,46 @@ +# frozen_string_literal: true + +module Kiba + module Extend + class RegistryEntrySelector + # @param cstr [String] stringified class name + def created_by_class(cstr) + with_creator.select { |entry| entry.creator.owner.to_s[cstr] } + end + + # @param mstr [String] stringified method name + def created_by_method(mstr) + matcher = "# false, + Kiba::Extend::Destinations::CSV => true, + Kiba::Common::Destinations::CSV => true, + Kiba::Common::Destinations::Lambda => false, + Kiba::Common::Sources::CSV => true, + Kiba::Common::Sources::Enumerable => false + } + data[klass] + end + + def file_options(klass) + data = { + nil => nil, + Kiba::Extend::Destinations::CSV => Kiba::Extend.csvopts, + Kiba::Common::Destinations::CSV => Kiba::Extend.csvopts, + Kiba::Common::Destinations::Lambda => Kiba::Extend.lambdaopts, + Kiba::Common::Sources::CSV => Kiba::Extend.csvopts, + Kiba::Common::Sources::Enumerable => nil + } + data[klass] + end + + def labeled_options(klass) + data = { + nil => nil, + Kiba::Extend::Destinations::CSV => { options_label(klass) => file_options(klass) }, + Kiba::Common::Destinations::CSV => { options_label(klass) => file_options(klass) }, + Kiba::Common::Destinations::Lambda => { options_label(klass) => file_options(klass) }, + Kiba::Common::Sources::CSV => { options_label(klass) => file_options(klass) }, + Kiba::Common::Sources::Enumerable => nil + } + data[klass] + end + + # The Symbol used for the options in the Kiba Source/Destination file configuration hash + def options_label(klass) + data = { + nil => nil, 
+ Kiba::Extend::Destinations::CSV => :csv_options, + Kiba::Common::Destinations::CSV => :csv_options, + Kiba::Common::Destinations::Lambda => :options, + Kiba::Common::Sources::CSV => :csv_options, + Kiba::Common::Sources::Enumerable => nil + } + data[klass] + end + end + end +end diff --git a/lib/kiba/extend/sources/file_set.rb b/lib/kiba/extend/sources/file_set.rb new file mode 100644 index 000000000..f639ed69c --- /dev/null +++ b/lib/kiba/extend/sources/file_set.rb @@ -0,0 +1,34 @@ +# frozen_string_literal: true + +module Kiba + module Extend + # Reusable data sources for use with Kiba + module Sources + # Selects multiple files from a directory, with an option to do so recursively. Supports include/exclude glob filters + class FileSet + attr_reader :files + + def initialize(path:, recursive: false, include: nil, exclude: nil) + @path = path + @recurse = recursive + @include = include + @exclude = exclude + @files = select_files + end + + private + + def select_files + if @recurse + files = Dir.glob("#{@path}/**/*").sort + else + files = Dir.children(@path).sort + end + files = files.select { |file| File.basename(file).match(Regexp.new(@include)) } if @include + files = files.reject { |file| File.basename(file).match(Regexp.new(@exclude)) } if @exclude + files + end + end + end + end +end diff --git a/lib/kiba/extend/transforms/deduplicate.rb b/lib/kiba/extend/transforms/deduplicate.rb index b6e4b573c..90105e528 100644 --- a/lib/kiba/extend/transforms/deduplicate.rb +++ b/lib/kiba/extend/transforms/deduplicate.rb @@ -7,8 +7,82 @@ module Transforms module Deduplicate ::Deduplicate = Kiba::Extend::Transforms::Deduplicate + # Removes the value(s) of `source` from `targets` + # + # Input table: + # + # ``` + # | x | y | z | + # |-----+-----+-----| + # | a | a | b | + # | a | a | a | + # | a | b;a | a;c | + # | a;b | b;a | a;c | + # | a | aa | bat | + # | nil | a | nil | + # | | ;a | b; | + # | a | nil | nil | + # | a | A | a | + # ``` + # + # Used in pipeline as: + 
# + # ``` + # transform Deduplicate::Fields, source: :x, targets: %i[y z], multival: true, sep: ';' + # ``` + # + # Results in: + # + # ``` + # | x | y | z | + # |-----+-----+-----| + # | a | nil | b | + # | a | nil | nil | + # | a | b | c | + # | a;b | nil | c | + # | a | aa | bat | + # | nil | a | nil | + # | | a | b | + # | a | nil | nil | + # | a | A | nil | + # ``` + # + # Input table: + # + # ``` + # | x | y | z | + # |---+---+---| + # | a | A | a | + # | a | a | B | + # ``` + # + # Used in pipeline as: + # + # ``` + # transform Deduplicate::Fields, + # source: :x, + # targets: %i[y z], + # multival: true, + # sep: ';', + # casesensitive: false + # ``` + # + # Results in: + # + # ``` + # | x | y | z | + # |---+-----+-----| + # | a | nil | nil | + # | a | nil | B | + # ``` + # class Fields - def initialize(source:, targets:, casesensitive: true, multival: false, sep: DELIM) + # @param source [Symbol] name of field containing value to remove from target fields + # @param targets [Array] names of fields to remove source value(s) from + # @param casesensitive [Boolean] whether matching should be case sensitive + # @param multival [Boolean] whether to treat as multi-valued + # @param sep [String] used to split/join multi-val field values + def initialize(source:, targets:, casesensitive: true, multival: false, sep: Kiba::Extend.delim) @source = source @targets = targets @casesensitive = casesensitive @@ -53,7 +127,45 @@ def process(row) end end + # Removes duplicate values within the given field(s) + # + # Processes one field at a time. 
Splits value on sep, and keeps only the unique values + # + # @note This is NOT safe for use with groupings of fields whose multi-values are expected + # to be the same length + # + # Input table: + # + # ``` + # | foo | bar | + # |-------------------------| + # | 1;1;1;2;2;2 | a;A;b;b;b | + # | | q;r;r | + # | 1 | 2 | + # | 1 | 2 | + # ``` + # + # Used in pipeline as: + # + # ``` + # @deduper = {} + # transform Deduplicate::FieldValues, fields: %i[foo bar], sep: ';' + # ``` + # + # Results in: + # + # ``` + # | foo | bar | + # |-----------------| + # | 1;2 | a;A;b | + # | | q;r | + # | 1 | 2 | + # | 1 | 2 | + # ``` + # class FieldValues + # @param fields [Array] names of fields in which to deduplicate values + # @param sep [String] used to split/join multivalued field values def initialize(fields:, sep:) @fields = fields @sep = sep @@ -63,13 +175,57 @@ def initialize(fields:, sep:) def process(row) @fields.each do |field| val = row.fetch(field) - row[field] = val.split(@sep).uniq.join(@sep) unless val.nil? + row[field] = val.to_s.split(@sep).uniq.join(@sep) unless val.nil? end row end end + # Adds a field (`in_field`) containing 'y' or 'n', indicating whether value of `on_field` is a duplicate + # + # The first instance of a value in `on_field` is always marked `n`. Subsequent rows containing the same + # value will be marked 'y' + # + # Use this transform if you need to retain/report on what will be treated as a duplicate. 
Use + # {Kiba::Extend::Transforms::FilterRows::FieldEqualTo} to extract only the duplicate rows and/or to keep only the + # non-duplicate rows + # + # To delete duplicates all in one step, use {Kiba::Extend::Transforms::Deduplicate::Table} + # + # Input table: + # + # ``` + # | foo | bar | combined | + # |-----------------------| + # | a | b | a b | + # | c | d | c d | + # | c | e | c e | + # | c | d | c d | + # ``` + # + # Used in pipeline as: + # + # ``` + # @deduper = {} + # transform Deduplicate::Flag, on_field: :combined, in_field: :duplicate, using: @deduper + # ``` + # + # Results in: + # + # ``` + # | foo | bar | combined | duplicate | + # |----------------------------------| + # | a | b | a b | n | + # | c | d | c d | n | + # | c | e | c e | n | + # | c | d | c d | y | + # ``` + # class Flag + # @param on_field [Symbol] Field on which to deduplicate + # @param in_field [Symbol] New field in which to add 'y' or 'n' + # @param using [Hash] An empty Hash, set as an instance variable in your job definition before you + # use this transform def initialize(on_field:, in_field:, using:) @on = on_field @in = in_field @@ -89,7 +245,46 @@ def process(row) end end + # Field value deduplication that is at least semi-safe for use with grouped fields that expect the same number + # of values for each field in the grouping + # + # @note Tread with caution, as this has not been used much and is not extensively tested + # @todo Refactor this hideous mess + # + # + # Input table: + # + # ``` + # | name | work | role | + # |-----------------------+------------------------+----------------------------------------| + # | Fred;Freda;Fred;James | Report;Book;Paper;Book | author;photographer;editor;illustrator | + # | ; | ; | ; | + # | Martha | Book | contributor | + # ``` + # + # Used in pipeline as: + # + # ``` + # transform Deduplicate::GroupedFieldValues, + # on_field: :name, + # grouped_fields: %i[work role], + # sep: ';' + # ``` + # + # Results in: + # + # ``` + # | name | 
work | role | + # |------------------+------------------+---------------------------------| + # | Fred;Freda;James | Report;Book;Book | author;photographer;illustrator | + # | nil | nil | nil | + # | Martha | Book | contributor | + # ``` + # class GroupedFieldValues + # @param on_field [Symbol] the field whose values are deduplicated + # @param sep [String] used to split/join multivalued field values + # @param grouped_fields [Array] other fields in the same multi-field grouping as `on_field` def initialize(on_field:, sep:, grouped_fields: []) @field = on_field @other = grouped_fields @@ -146,6 +341,71 @@ def get_value_frequency(fv) h end end + + # Given a field on which to deduplicate, removes duplicate rows from table + # + # Keeps the row with the first instance of the value in the deduplicating field + # + # Tip: Use {Kiba::Extend::Transforms::CombineValues::FromFieldsWithDelimiter} or + # {Kiba::Extend::Transforms::CombineValues::FullRecord} to create a combined field on which to deduplicate + # + # @note This transform runs in memory, so for very large sources, it may take a long time or fail.
In this + # case, use a combination of {Flag} and {Kiba::Extend::Transforms::FilterRows::FieldEqualTo} + # + # Input table: + # + # ``` + # | foo | bar | baz | combined | + # |-----------------------------| + # | a | b | f | a b | + # | c | d | g | c d | + # | c | e | h | c e | + # | c | d | i | c d | + # ``` + # + # Used in pipeline as: + # + # ``` + # transform Deduplicate::Table, field: :combined, delete_field: true + # ``` + # + # Results in: + # + # ``` + # | foo | bar | baz | + # |-----------------| + # | a | b | f | + # | c | d | g | + # | c | e | h | + # ``` + # + class Table + # @param field [Symbol] name of field on which to deduplicate + # @param delete_field [Boolean] whether to delete the deduplication field after doing deduplication + def initialize(field:, delete_field: false) + @field = field + @deduper = {} + @delete = delete_field + end + + # @private + def process(row) + field_val = row.fetch(@field, nil) + return if field_val.blank? + return if @deduper.key?(field_val) + + @deduper[field_val] = row + nil + end + + # @private + def close + @deduper.values.each do |row| + row.delete(@field) if @delete + yield row + end + end + end end end end diff --git a/lib/kiba/extend/transforms/extract.rb b/lib/kiba/extend/transforms/extract.rb new file mode 100644 index 000000000..0ecab1ded --- /dev/null +++ b/lib/kiba/extend/transforms/extract.rb @@ -0,0 +1,101 @@ +# frozen_string_literal: true + +module Kiba + module Extend + module Transforms + # Transformations that extract specified data from a source + module Extract + ::Extract = Kiba::Extend::Transforms::Extract + + # Extracts the values of the given fields to a single `:value` column + # + # Inserts a `:from_field` column recording original field name for the value + # in each row. This can be turned off, resulting in a single-column result. + # + # Optionally, if given `:sep` value, splits multi-val fields to separate rows.
+ # + # @note This will collapse any source data to a one or two column result. It runs in-memory, + # so for very large sources, it may take a long time or fail + # + # Input table: + # + # ``` + # | foo | bar | baz | boo| + # |----------------------| + # | a:b | e | f | | + # | c | nil | g | h | + # | :d | i: | j | k | + # ``` + # + # Used in pipeline as: + # + # ``` + # transform Extract::Fields, fields: %i[foo bar], sep: ':' + # ``` + # + # Results in: + # + # ``` + # | value | from_field | + # |--------------------| + # | a | foo | + # | b | foo | + # | e | bar | + # | c | foo | + # | d | foo | + # | i | bar | + # ``` + # + # Used in pipeline as: + # + # ``` + # transform Extract::Fields, fields: %i[foo bar], source_field_track: false + # ``` + # + # Results in: + # + # ``` + # | value | + # |-------- + # | a:b | + # | e | + # | c | + # | :d | + # | i: | + # ``` + class Fields + def initialize(fields:, sep: nil, source_field_track: true) + @fields = fields + @sep = sep + @track = source_field_track + @rows = [] + end + + def process(row) + @fields.each{ |field| extract_field_value(row, field) } + nil + end + + def close + @rows.each{ |row| yield row } + end + + private + + def extract_field_value(row, field) + field_val = row.fetch(field, nil) + return if field_val.blank? + + vals = @sep ? field_val.split(@sep) : [field_val] + vals.each do |val| + next if val.blank? + + new_row = @track ? { value: val, from_field: field } : { value: val } + @rows << new_row + end + end + end + end + end + end +end diff --git a/lib/kiba/extend/transforms/helpers.rb b/lib/kiba/extend/transforms/helpers.rb index 43aa2b279..1d3388dbd 100644 --- a/lib/kiba/extend/transforms/helpers.rb +++ b/lib/kiba/extend/transforms/helpers.rb @@ -5,7 +5,7 @@ module Extend module Transforms # utility functions across Transforms module Helpers - extend self + module_function # Indicates whether a field value is delimiter-only. 
If `usenull` is set to true, the # %NULLVALUE% string is treated as empty in detecting delimiter-only-ness # @param val [String] The field value to check @@ -33,9 +33,9 @@ def field_values(row:, fields:, discard: %i[nil empty delim], delim: DELIM, usen field_vals = fields.map { |field| [field, row.fetch(field, nil)] }.to_h return field_vals if discard.blank? - field_vals = field_vals.reject { |field, val| val.nil? } if discard.any?(:nil) + field_vals = field_vals.reject { |_field, val| val.nil? } if discard.any?(:nil) keep = keep_fields(field_vals, discard, delim, usenull) - field_vals.select { |field, val| keep.any?(field) } + field_vals.select { |field, _val| keep.any?(field) } end # @param field_vals [Hash{Symbol=>String,Nil}l] A subset of a row @@ -46,8 +46,8 @@ def field_values(row:, fields:, discard: %i[nil empty delim], delim: DELIM, usen # and usenull param values and the field values private_class_method def keep_fields(field_vals, discard, delim, usenull) field_vals = field_vals.transform_values { |val| val.gsub('%NULLVALUE%', '') } if usenull - field_vals = field_vals.reject { |field, val| val.empty? } if discard.any?(:empty) - field_vals = field_vals.reject { |field, val| delim_only?(val, delim) } if discard.any?(:delim) + field_vals = field_vals.reject { |_field, val| val.empty? } if discard.any?(:empty) + field_vals = field_vals.reject { |_field, val| delim_only?(val, delim) } if discard.any?(:delim) field_vals.keys end end diff --git a/lib/kiba/extend/transforms/split.rb b/lib/kiba/extend/transforms/split.rb index 3ab01acef..686ed04ee 100644 --- a/lib/kiba/extend/transforms/split.rb +++ b/lib/kiba/extend/transforms/split.rb @@ -106,7 +106,7 @@ def initialize(field:, sep:, delete_source: true, max_segments:, collapse_on: :r @collapse_on = collapse_on @warn = !warnfield.blank? 
@warnfield = warnfield ||= :warning - @new_fields = ( 0..( @max - 1 ) ).entries.map { |entry| "#{field}#{entry}".to_sym } + @new_fields = (0..(@max - 1)).entries.map { |entry| "#{field}#{entry}".to_sym } end # rubocop:enable Metrics/ParameterLists @@ -156,13 +156,13 @@ def exceeds_max?(valsplit) end def process_exceeding(valsplit, row) - if @collapse_on == :right - process_right_split(valsplit, row) - end + return unless @collapse_on == :right + + process_right_split(valsplit, row) end def process_right_collapse(valsplit, row) - valsplit.slice!(0..( diff(valsplit) - 1 )).each_with_index do |val, i| + valsplit.slice!(0..(diff(valsplit) - 1)).each_with_index do |val, i| row["#{@field}#{i}".to_sym] = val end row["#{@field}#{@max - 1}".to_sym] = valsplit.join(@sep) diff --git a/lib/kiba/extend/utils/lookup.rb b/lib/kiba/extend/utils/lookup.rb index bd8c30800..c259541e8 100644 --- a/lib/kiba/extend/utils/lookup.rb +++ b/lib/kiba/extend/utils/lookup.rb @@ -5,9 +5,10 @@ module Extend module Utils module Lookup ::Lookup = Kiba::Extend::Utils::Lookup + extend self # use when keycolumn values are unique # creates hash with keycolumn value as key and csv-row-as-hash as the value - def self.csv_to_hash(file:, keycolumn:, csvopt: {}) + def csv_to_hash_deprecated(file:, keycolumn:, csvopt: {}) CSV.foreach(File.expand_path(file), csvopt).each_with_object({}) do |r, memo| memo[r.fetch(keycolumn, nil)] = r.to_h end @@ -15,9 +16,9 @@ def self.csv_to_hash(file:, keycolumn:, csvopt: {}) # use when keycolumn values are not unique # creates hash with keycolumn value as key and array of csv-rows-as-hashes as the value - def self.csv_to_multi_hash(file:, keycolumn:, csvopt: {}) - CSV.foreach(File.expand_path(file), csvopt).each_with_object({}) do |r, memo| - k = r.fetch(keycolumn, nil) + def csv_to_hash(**args) + CSV.foreach(File.expand_path(args[:file]), args[:csvopt]).each_with_object({}) do |r, memo| + k = r.fetch(args[:keycolumn], nil) if memo.key?(k) memo[k] << r.to_h else @@ -26,6 
+27,8 @@ def self.csv_to_multi_hash(file:, keycolumn:, csvopt: {}) end end + alias csv_to_multi_hash csv_to_hash + class SetChecker attr_reader :set_type, :result diff --git a/lib/kiba/extend/version.rb b/lib/kiba/extend/version.rb index ee40bae27..d945aee50 100644 --- a/lib/kiba/extend/version.rb +++ b/lib/kiba/extend/version.rb @@ -2,6 +2,6 @@ module Kiba module Extend - VERSION = '2.1.1' + VERSION = '2.2.0' end end diff --git a/spec/fixtures/base_job_base.csv b/spec/fixtures/base_job_base.csv new file mode 100644 index 000000000..aef763abd --- /dev/null +++ b/spec/fixtures/base_job_base.csv @@ -0,0 +1,3 @@ +letter,number +a,1 +b,2 diff --git a/spec/fixtures/base_job_lookup.csv b/spec/fixtures/base_job_lookup.csv new file mode 100644 index 000000000..96aebac60 --- /dev/null +++ b/spec/fixtures/base_job_lookup.csv @@ -0,0 +1,3 @@ +letter,word +a,aardvark +b,bird diff --git a/spec/fixtures/existing.csv b/spec/fixtures/existing.csv new file mode 100644 index 000000000..1d3d76ee6 --- /dev/null +++ b/spec/fixtures/existing.csv @@ -0,0 +1,3 @@ +objectNumber,numberOfObjects,briefDescription,distinguishingFeatures,comment +OBJ1,1,"À la base, une sorte d’aquarium avait été installé tout autour.",je suis là. » Dit Prélude.,Les explications continuèrent pendant une bonne heure. David laissait parler Prélude. +OBJ2,2,"Il sort de son lit, les yeux dans un brouillard londonien, avance jusqu'à la salle de bain dont la baignoire a été remplie cinq minutes avant par l'ordinateur de la maison, et va directement prendre un bain.",Il s’est alors ‘transporté’ sur Internet afin de choisir une nouvelle ‘maison’.,C’est alors qu’il a décidé d’aller lui même à l’information. 
\ No newline at end of file diff --git a/spec/helpers.rb b/spec/helpers.rb index b61dcdd64..a5e24bb69 100644 --- a/spec/helpers.rb +++ b/spec/helpers.rb @@ -1,15 +1,183 @@ # frozen_string_literal: true module Helpers + module_function + + # Format examples for Yard documentation + # + # Use: Helpers::ExampleFormatter.new(input, expected) + class ExampleFormatter + def initialize(*args) + args.each do |arg| + @clean = nil + @norm = nil + @table = nil + @headers = nil + @maxes = {} + @data = arg + build_table + put_table + end + end + + private + + def build_table + clean_data + populate_maxes + normalize + format_table + end + + def clean_data + @clean = [] + @data.each do |row| + @clean << row.transform_values{ |val| val.nil? ? 'nil' : val } + end + end + + def format_table + @table = [] + @table << headers + @table << grab_divider + grab_rows + end + + def headers + @headers ||= @norm.first.keys + end + + def grab_divider + div = [] + headers.each do |header| + segment = '-' * header.length + div << segment + end + div + end + + def grab_row(row) + t_row = [] + headers.each do |header| + t_row << row[header] + end + t_row + end + + def grab_rows + @norm.each{ |row| @table << grab_row(row) } + end + + def max_val_length_for_header(header) + @clean.map{ |row| row[header].length }.max + end + + def normalize + @norm = [] + @clean.each{ |row| @norm << normalize_row(row) } + end + + def normalize_row(row) + norm = {} + row.each do |header, val| + max = @maxes[header] + norm[header.to_s.ljust(max)] = val.ljust(max) + end + norm + end + + def populate_maxes + @maxes = @clean.first.map{ |e| [e[0], e[0].length] }.to_h + headers = @maxes.keys + headers.each do |hdr| + val_max = max_val_length_for_header(hdr) + @maxes[hdr] = val_max if val_max > @maxes[hdr] + end + end + + def put_row(row) + puts "# | #{row.join(' | ')} |" + end + + def put_table + table = @table.dup + puts '' + puts '#' + puts '# ```' + put_row(table.shift) + div = table.shift + puts "# 
|-#{div.join('-+-')}-|" + table.each{ |row| put_row(row) } + puts '# ```' + puts '#' + end + end + + + + def fixtures_dir + app_dir = File.realpath(File.join(File.dirname(__FILE__), '..')) + File.join(app_dir, 'spec', 'fixtures') + end + + def populate_registry + fkeypath = File.join(fixtures_dir, 'fkey.csv') + entries = { fkey: { path: fkeypath, supplied: true, lookup_on: :id }, + invalid: {}, + fee: { path: fkeypath, lookup_on: :foo, supplied: true }, + foo: { path: fkeypath, creator: Helpers.method(:test_csv), tags: %i[test] }, + bar: { path: fkeypath, creator: Helpers.method(:lookup_csv), tags: %i[test report] }, + baz: { path: fkeypath, creator: Kiba::Extend::Utils::Lookup.method(:csv_to_hash), tags: %i[report] }, + warn: { path: fkeypath, dest_class: Kiba::Common::Destinations::CSV, + creator: Kiba::Extend.method(:csvopts), + dest_special_opts: { initial_headers: %i[objectnumber briefdescription] } } } + entries.each { |key, data| Kiba::Extend.registry.register(key, data) } + Kiba::Extend.registry.namespace(:ns) do + namespace(:sub) do + register(:fkey, { path: 'data', supplied: true }) + end + end + end + + def transform_registry + Kiba::Extend.registry.transform + end + + def prepare_registry + populate_registry + transform_registry + end + + def fake_creator_method + FileUtils.touch(File.join(fixtures_dir, 'base_job_missing.csv')) + end + + # for test in Kiba::Extend::Jobs::BaseJobsSpec that I can't get working + # class BaseJob + # include Kiba::Extend::Jobs::Runner + + # attr_reader :files + # def initialize(files:) + # @files = setup_files(files) + # end + + # def creator=(arg) + # @creator = arg + # end + + # def creator + # @creator + # end + # end + def test_csv File.join(File.expand_path(__dir__), 'tmp', 'test.csv') end + def lookup_csv File.join(File.expand_path(__dir__), 'tmp', 'lkup.csv') end - - def generate_csv(rows) CSV.open(test_csv, 'w') do |csv| rows.each { |row| csv << row } diff --git a/spec/kiba/extend/jobs/job_spec.rb 
b/spec/kiba/extend/jobs/job_spec.rb new file mode 100644 index 000000000..323b9b32a --- /dev/null +++ b/spec/kiba/extend/jobs/job_spec.rb @@ -0,0 +1,71 @@ +# frozen_string_literal: true + +require 'spec_helper' + +# rubocop:disable Metrics/BlockLength +RSpec.describe 'Kiba::Extend::Jobs::Job' do + before(:context) do + @dest_file = File.join(fixtures_dir, 'base_job_dest.csv') + Kiba::Extend.config.registry = Kiba::Extend::FileRegistry.new + entries = { base_src: { path: File.join(fixtures_dir, 'base_job_base.csv'), supplied: true }, + base_lookup: { path: File.join(fixtures_dir, 'base_job_lookup.csv'), supplied: true, + lookup_on: :letter }, + base_dest: { path: @dest_file, creator: Helpers.method(:fake_creator_method) }, } + entries.each { |key, data| Kiba::Extend.registry.register(key, data) } + transform_registry + end + before(:each) do + FileUtils.rm(@dest_file) if File.exist?(@dest_file) + end + after(:each) do + FileUtils.rm(@dest_file) if File.exist?(@dest_file) + end + + let(:base_job) { Kiba::Extend::Jobs::Job.new(files: base_job_config, transformer: base_job_transforms) } + let(:base_job_config) { { source: [:base_src], destination: ['base_dest'], lookup: [:base_lookup] } } + let(:base_job_transforms) do + Kiba.job_segment do + transform Kiba::Extend::Transforms::Rename::Field, from: :letter, to: :alpha + transform Merge::MultiRowLookup, + lookup: base_lookup, + keycolumn: :alpha, + fieldmap: { + from_lkup: :word, + }, + delim: Kiba::Extend.delim + end + end + + context 'with defaults' do + let(:job) { base_job } + context 'when dependency files exist' do + it 'runs and produces expected result' do + job + result = CSV.read(@dest_file) + expected = [['number', 'alpha', 'from_lkup'], ['1', 'a', 'aardvark'], ['2', 'b', 'bird']] + expect(result).to eq(expected) + end + end + + context 'when dependency files do not exist' do + let(:base_job_config) { { source: [:missing_src], destination: [:base_dest], lookup: [:base_lookup] } } + # cannot figure out how to 
test this in a timely manner. Will test manually for now. + xit 'calls dependency creators' do + missing_file = File.join(fixtures_dir, 'base_job_missing.csv') + creator = double() + Kiba::Extend.config.registry = Kiba::Extend::FileRegistry.new + entries = { base_lookup: { path: File.join(fixtures_dir, 'base_job_lookup.csv'), supplied: true, lookup_on: :letter }, + base_dest: { path: @dest_file, creator: Helpers.method(:fake_creator_method) }, + missing_src: { path: missing_file, creator: Helpers::BaseJob.method(:creator) } } + entries.each { |key, data| Kiba::Extend.registry.register(key, data) } + transform_registry + testjob = Helpers::BaseJob.new(files: base_job_config) + testjob.creator = creator + expect(creator).to receive(:call) + testjob.handle_requirements + end + end + # raise_error(Kiba::Extend::Jobs::Runner::MissingDependencyError) + end +end +# rubocop:enable Metrics/BlockLength diff --git a/spec/kiba/extend/jobs/testing_job_spec.rb b/spec/kiba/extend/jobs/testing_job_spec.rb new file mode 100644 index 000000000..8614a7e36 --- /dev/null +++ b/spec/kiba/extend/jobs/testing_job_spec.rb @@ -0,0 +1,35 @@ +# frozen_string_literal: true + +require 'spec_helper' + +# rubocop:disable Metrics/BlockLength +RSpec.describe 'Kiba::Extend::Jobs::TestingJob' do + let(:test_job) { Kiba::Extend::Jobs::TestingJob.new(files: test_job_config, transformer: test_job_transforms) } + let(:test_job_config){ { source: src, destination: dest } } + let(:src){ [{foo: 1, bar: 2}, {foo: 3, bar: 4}] } + let(:dest){ [] } + let(:test_job_transforms) do + Kiba.job_segment do + transform Kiba::Extend::Transforms::Rename::Field, from: :bar, to: :baz + # transform Merge::MultiRowLookup, + # lookup: base_lookup, + # keycolumn: :alpha, + # fieldmap: { + # from_lkup: :word, + # }, + # delim: Kiba::Extend.delim + end + end + + context 'with defaults' do + let(:job) { test_job } + context 'when dependency files exist' do + it 'runs and produces expected result' do + job + expected = [{foo: 1, 
baz: 2}, {foo: 3, baz: 4}] + expect(dest).to eq(expected) + end + end + end +end +# rubocop:enable Metrics/BlockLength diff --git a/spec/kiba/extend/registry/file_registry_entry_spec.rb b/spec/kiba/extend/registry/file_registry_entry_spec.rb new file mode 100644 index 000000000..da8ae3381 --- /dev/null +++ b/spec/kiba/extend/registry/file_registry_entry_spec.rb @@ -0,0 +1,82 @@ +# frozen_string_literal: true + +require 'spec_helper' + +# rubocop:disable Metrics/BlockLength +RSpec.describe 'Kiba::Extend::FileRegistryEntry' do + let(:path) { File.join('spec', 'fixtures', 'fkey.csv') } + let(:entry) { Kiba::Extend::FileRegistryEntry.new(data) } + let(:reghash) do + { + fkey: { path: path, key: :foo }, + foo: { path: path, creator: Helpers.method(:test_csv), tags: %i[test] }, + bar: { path: path, creator: Helpers.method(:lookup_csv), tags: %i[test report] }, + baz: { path: path, creator: Kiba::Extend::Utils::Lookup.method(:csv_to_hash), tags: %i[report] }, + namespace: { + foo: { path: path, creator: Helpers.method(:test_csv), tags: %i[test] }, + sub: { + fkey: { path: path }, + baz: { path: path, creator: Helpers.method(:test_csv), tags: %i[report] }, + } + } + } + end + + context 'with valid data' do + let(:data) { { path: path, creator: Helpers.method(:test_csv) } } + it 'valid as expected' do + expect(entry.path).to eq(Pathname.new(path)) + expect(entry.valid?).to be true + end + end + + context 'without path' do + context 'when CSV source/dest' do + let(:data) { { pat: path, supplied: true } } + it 'invalid as expected' do + expect(entry.path).to be_nil + expect(entry.valid?).to be false + expect(entry.errors.key?(:missing_path)).to be true + end + end + + context 'when un-written source/dest' do + let(:data) { + { src_class: Kiba::Common::Sources::Enumerable, + dest_class: Kiba::Common::Destinations::Lambda, + supplied: true } + } + it 'valid as expected' do + expect(entry.path).to be_nil + expect(entry.valid?).to be true + end + end + end + + context 'without 
creator' do + context 'when supplied file' do + let(:data) { { path: path, supplied: true } } + it 'valid' do + expect(entry.valid?).to be true + end + end + + context 'when not a supplied file' do + let(:data) { { path: path } } + it 'invalid as expected' do + expect(entry.valid?).to be false + expect(entry.errors[:missing_creator_for_non_supplied_file]).to be_nil + end + end + end + + context 'with non-method creator' do + let(:data) { { path: path, creator: 'a string' } } + it 'invalid as expected' do + expect(entry.creator).to be_nil + expect(entry.valid?).to be false + expect(entry.errors[:creator_not_a_method]).to eq('a string') + end + end +end +# rubocop:enable Metrics/BlockLength diff --git a/spec/kiba/extend/registry/file_registry_spec.rb b/spec/kiba/extend/registry/file_registry_spec.rb new file mode 100644 index 000000000..96e1156d4 --- /dev/null +++ b/spec/kiba/extend/registry/file_registry_spec.rb @@ -0,0 +1,82 @@ +# frozen_string_literal: true + +require 'spec_helper' + +# rubocop:disable Metrics/BlockLength +RSpec.describe 'Kiba::Extend::FileRegistry' do + before(:context) do + Kiba::Extend.config.registry = Kiba::Extend::FileRegistry.new + populate_registry + end + let(:filekey) { :fkey } + let(:fkeypath) { File.join(fixtures_dir, 'fkey.csv') } + let(:registry) { Kiba::Extend.registry } + let(:result) { registry.resolve(filekey) } + + describe 'initial setup and registration' do + context 'when no namespace' do + let(:data) { { path: fkeypath, supplied: true, lookup_on: :id } } + it 'registers and resolves' do + expect(result).to eq(data) + end + + context 'with insufficient data' do + let(:filekey) { :invalid } + let(:data) { {} } + it 'registers and resolves' do + expect(result).to eq(data) + end + end + end + + context 'with namespace' do + it 'registers and resolves' do + expect(registry.resolve('ns__sub__fkey')).to eq({ path: 'data', supplied: true }) + end + end + end + + # subsequent tests depend on the transformation having been done here + 
describe 'post-transformation' do + before(:context) { transform_registry } + describe '#transform' do + it 'converts all registered items to FileRegistryEntry objects' do + chk = [] + registry.each { |item| chk << item[1].class } + chk.uniq! + expect(chk.length).to eq(1) + expect(chk.first).to eq(Kiba::Extend::FileRegistryEntry) + end + end + + describe 'as destination' do + let(:result) { registry.as_destination(filekey) } + it 'returns destination file config' do + expect(result).to be_a(Kiba::Extend::RegisteredDestination) + end + + context 'when called with nonexistent key' do + let(:filekey) { :cats } + it 'raises error' do + msg = "No file registered under the key: :#{filekey}" + expect { result }.to raise_error(Kiba::Extend::FileRegistry::KeyNotRegisteredError, msg) + end + end + end + + describe 'as lookup' do + let(:result) { registry.as_lookup(filekey) } + it 'returns lookup file config' do + expect(result).to be_a(Kiba::Extend::RegisteredLookup) + end + end + + describe 'as source' do + let(:result) { registry.as_source(filekey) } + it 'returns source file config' do + expect(result).to be_a(Kiba::Extend::RegisteredSource) + end + end + end +end +# rubocop:enable Metrics/BlockLength diff --git a/spec/kiba/extend/registry/registered_destination_spec.rb b/spec/kiba/extend/registry/registered_destination_spec.rb new file mode 100644 index 000000000..48bace00e --- /dev/null +++ b/spec/kiba/extend/registry/registered_destination_spec.rb @@ -0,0 +1,100 @@ +# frozen_string_literal: true + +require 'spec_helper' + +# rubocop:disable Metrics/BlockLength +RSpec.describe 'Kiba::Extend::RegisteredDestination' do + let(:filekey) { :fkey } + let(:path) { File.join('spec', 'fixtures', 'fkey.csv') } + let(:default) { { path: path } } + let(:default_desc) { { path: path, desc: 'description' } } + let(:dest) { Kiba::Extend::RegisteredDestination.new(key: filekey, data: Kiba::Extend::FileRegistryEntry.new(data)) } + let(:optres) { { csv_options: Kiba::Extend.csvopts } } + 
describe '#args' do + let(:result) { dest.args } + context 'with basic defaults' do + let(:data) { default } + let(:expected) do + [{ filename: path }.merge(optres)] + end + it 'returns with Kiba::Extend csvopts' do + expect(result).to eq(expected) + end + end + + context 'with given options' do + let(:override_opts) { { foo: :bar } } + let(:data) { { path: path, dest_opt: override_opts } } + let(:expected) do + [{ filename: path, csv_options: override_opts }] + end + it 'returns with given opts' do + expect(result).to eq(expected) + end + end + + context 'with extra options' do + context 'when extra option is allowed for destination class' do + let(:extra) { { initial_headers: %i[a b] } } + let(:data) { { path: path, dest_class: Kiba::Extend::Destinations::CSV, dest_special_opts: extra } } + let(:expected) do + [{ filename: path, csv_options: Kiba::Extend.csvopts, initial_headers: %i[a b] }] + end + it 'returns with extra options' do + expect(result).to eq(expected) + end + end + + context 'when extra option is not defined for destination class' do + let(:extra) { { blah: %i[a b] } } + let(:data) { { path: path, dest_class: Kiba::Extend::Destinations::CSV, dest_special_opts: extra } } + let(:expected) do + [{ filename: path, csv_options: Kiba::Extend.csvopts }] + end + it 'returns without extra options' do + expect(result).to eq(expected) + end + it 'warns about unsupported options' do + msg = "WARNING: Destination file :#{filekey} is called with special option :blah, which is unsupported by Kiba::Extend::Destinations::CSV\n" + expect { dest.args }.to output(msg).to_stdout + end + end + end + end + + describe '#description' do + let(:result) { dest.description } + context 'when not given' do + let(:data) { default } + it 'returns empty string' do + expect(result).to eq('') + end + end + + context 'when given' do + let(:data) { default_desc } + it 'returns given value' do + expect(result).to eq('description') + end + end + end + + describe '#klass' do + 
let(:result) { dest.klass } + context 'with basic defaults' do + let(:data) { default } + it 'returns Kiba::Extend default destination class' do + expect(result).to eq(Kiba::Extend.destination) + end + end + + context 'with a given class' do + let(:override_klass) { Kiba::Common::Sources::CSV } + let(:data) { { path: path, dest_class: override_klass } } + it 'returns given class' do + expect(result).to eq(override_klass) + end + end + end +end +# rubocop:enable Metrics/BlockLength diff --git a/spec/kiba/extend/registry/registered_file_spec.rb b/spec/kiba/extend/registry/registered_file_spec.rb new file mode 100644 index 000000000..7cef79462 --- /dev/null +++ b/spec/kiba/extend/registry/registered_file_spec.rb @@ -0,0 +1,35 @@ +# frozen_string_literal: true + +require 'spec_helper' + +# rubocop:disable Metrics/BlockLength +RSpec.describe 'Kiba::Extend::RegisteredFile' do + let(:filekey) { :fkey } + let(:path) { File.join('spec', 'fixtures', 'fkey.csv') } + let(:default) { { path: path } } + let(:dest) { Kiba::Extend::RegisteredFile.new(key: filekey, data: Kiba::Extend::FileRegistryEntry.new(data)) } + + context 'when called with no path' do + let(:data) { { description: 'blah' } } + it 'raises FileNotRegisteredError' do + msg = "No file path for :#{filekey} is recorded in file registry hash" + expect { + Kiba::Extend::RegisteredFile.new(key: filekey, + data: Kiba::Extend::FileRegistryEntry.new(data)) + }.to raise_error( + Kiba::Extend::RegisteredFile::NoFilePathError, msg + ) + end + end + + describe '#key' do + let(:result) { dest.key } + context 'with basic defaults' do + let(:data) { default } + it 'returns file key' do + expect(result).to eq(filekey) + end + end + end +end +# rubocop:enable Metrics/BlockLength diff --git a/spec/kiba/extend/registry/registered_lookup_spec.rb b/spec/kiba/extend/registry/registered_lookup_spec.rb new file mode 100644 index 000000000..81c35d9b2 --- /dev/null +++ b/spec/kiba/extend/registry/registered_lookup_spec.rb @@ -0,0 +1,45 @@ 
+# frozen_string_literal: true + +require 'spec_helper' + +# rubocop:disable Metrics/BlockLength +RSpec.describe 'Kiba::Extend::RegisteredLookup' do + let(:filekey) { :fkey } + let(:path) { File.join('spec', 'fixtures', 'fkey.csv') } + let(:key) { :foo } + let(:default) { { path: path, lookup_on: key, creator: Helpers.method(:test_csv) } } + let(:lookup) { Kiba::Extend::RegisteredLookup.new(key: filekey, data: Kiba::Extend::FileRegistryEntry.new(data)) } + + context 'when called without lookup key' do + let(:data) { { path: path } } + it 'raises NoLookupKeyError' do + msg = "No lookup key column found for :#{filekey} in file registry hash" + expect { lookup }.to raise_error(Kiba::Extend::RegisteredLookup::NoLookupKeyError, msg) + end + end + + describe '#args' do + let(:result) { lookup.args } + context 'with basic defaults' do + let(:data) { default } + let(:expected) do + { file: path, csvopt: Kiba::Extend.csvopts, keycolumn: key } + end + it 'returns with default csvopts' do + expect(result).to eq(expected) + end + end + + context 'with given options' do + let(:override_opts) { { foo: :bar } } + let(:data) { default.merge({ src_opt: override_opts }) } + let(:expected) do + { file: path, csvopt: override_opts, keycolumn: key } + end + it 'returns with given options' do + expect(result).to eq(expected) + end + end + end +end +# rubocop:enable Metrics/BlockLength diff --git a/spec/kiba/extend/registry/registered_source_spec.rb b/spec/kiba/extend/registry/registered_source_spec.rb new file mode 100644 index 000000000..d847c830a --- /dev/null +++ b/spec/kiba/extend/registry/registered_source_spec.rb @@ -0,0 +1,54 @@ +# frozen_string_literal: true + +require 'spec_helper' + +# rubocop:disable Metrics/BlockLength +RSpec.describe 'Kiba::Extend::RegisteredSource' do + let(:filekey) { :fkey } + let(:path) { File.join('spec', 'fixtures', 'fkey.csv') } + let(:default) { { path: path, creator: -> { Helpers.test_csv } } } + let(:source) { 
Kiba::Extend::RegisteredSource.new(key: filekey, data: Kiba::Extend::FileRegistryEntry.new(data)) } + + describe '#args' do + let(:result) { source.args } + context 'with basic defaults' do + let(:data) { default } + let(:expected) do + [{ filename: path, csv_options: Kiba::Extend.csvopts }] + end + it 'returns with Kiba::Extend default csvopts' do + expect(result).to eq(expected) + end + end + + context 'with given options' do + let(:override_opts) { { foo: :bar } } + let(:data) { { path: path, src_opt: override_opts } } + let(:expected) do + [{ filename: path, csv_options: override_opts }] + end + it 'returns with given opts' do + expect(result).to eq(expected) + end + end + end + + describe '#klass' do + let(:result) { source.klass } + context 'with basic defaults' do + let(:data) { default } + it 'returns Kiba::Extend default source class' do + expect(result).to eq(Kiba::Extend.source) + end + end + + context 'with a given class' do + let(:override_klass) { Kiba::Common::Destinations::CSV } + let(:data) { { path: path, src_class: override_klass } } + it 'returns given class' do + expect(result).to eq(override_klass) + end + end + end +end +# rubocop:enable Metrics/BlockLength diff --git a/spec/kiba/extend/registry/registry_entry_selector_spec.rb b/spec/kiba/extend/registry/registry_entry_selector_spec.rb new file mode 100644 index 000000000..2745d0a63 --- /dev/null +++ b/spec/kiba/extend/registry/registry_entry_selector_spec.rb @@ -0,0 +1,73 @@ +# frozen_string_literal: true + +require 'spec_helper' + +# rubocop:disable Metrics/BlockLength +RSpec.describe 'Kiba::Extend::RegistryEntrySelector' do + before(:context) do + Kiba::Extend.config.registry = Kiba::Extend::FileRegistry.new + prepare_registry + end + let(:selector) { Kiba::Extend::RegistryEntrySelector.new } + + describe '#tagged_any' do + let(:result) { selector.tagged_any(tags) } + context 'with :test, :report' do + let(:tags) { %w[test report] } + it 'returns entries tagged with given symbol' do + 
expect(result.length).to eq(3) + expect(result.map(&:key).sort).to eq(%w[bar baz foo]) + end + end + end + + describe '#tagged_all' do + let(:result) { selector.tagged_all(tags) } + context 'with :test, :report' do + let(:tags) { %w[test report] } + it 'returns entries tagged with given symbols' do + expect(result.length).to eq(1) + expect(result.map(&:key).sort).to eq(%w[bar]) + end + end + end + + describe '#created_by_class' do + let(:result) { selector.created_by_class(cstr) } + context 'with Kiba::Extend::Utils::Lookup' do + let(:cstr) { 'Kiba::Extend::Utils::Lookup' } + it 'returns entries created by given class or method' do + expect(result.length).to eq(1) + expect(result.map(&:key).sort).to eq(%w[baz]) + end + end + + context 'with Kiba::Extend' do + let(:cstr) { 'Kiba::Extend' } + it 'does not require full string match' do + expect(result.length).to eq(1) + expect(result.map(&:key).sort).to eq(%w[baz]) + end + end + + context 'with Helpers' do + let(:cstr) { 'Helpers' } + it 'returns entries created by given class or method' do + expect(result.length).to eq(2) + expect(result.map(&:key).sort).to eq(%w[bar foo]) + end + end + end + + describe '#created_by_method' do + let(:result) { selector.created_by_method(mstr) } + context 'with Kiba::Extend::Utils::Lookup.csv_to_hash' do + let(:mstr) { 'Kiba::Extend::Utils::Lookup.csv_to_hash' } + it 'returns entries created by given method' do + expect(result.length).to eq(1) + expect(result.map(&:key).sort).to eq(%w[baz]) + end + end + end +end +# rubocop:enable Metrics/BlockLength diff --git a/spec/kiba/extend/registry/registry_validator_spec.rb b/spec/kiba/extend/registry/registry_validator_spec.rb new file mode 100644 index 000000000..52c29b210 --- /dev/null +++ b/spec/kiba/extend/registry/registry_validator_spec.rb @@ -0,0 +1,20 @@ +# frozen_string_literal: true + +require 'spec_helper' + +# rubocop:disable Metrics/BlockLength +RSpec.describe 'Kiba::Extend::RegistryValidator' do + before(:context) do + 
Kiba::Extend.config.registry = Kiba::Extend::FileRegistry.new + prepare_registry + end + let(:validator) { Kiba::Extend::RegistryValidator.new } + + describe '#valid?' do + let(:result) { validator.valid? } + it 'reports invalid entries' do + expect(result).to be false + end + end +end +# rubocop:enable Metrics/BlockLength diff --git a/spec/kiba/extend/registry/requirable_file_spec.rb b/spec/kiba/extend/registry/requirable_file_spec.rb new file mode 100644 index 000000000..1eacf1a74 --- /dev/null +++ b/spec/kiba/extend/registry/requirable_file_spec.rb @@ -0,0 +1,46 @@ +# frozen_string_literal: true + +require 'spec_helper' + +class TestClass < Kiba::Extend::RegisteredFile + include Kiba::Extend::RequirableFile +end + +# rubocop:disable Metrics/BlockLength +RSpec.describe 'Kiba::Extend::RequirableFile' do + let(:filekey) { :fkey } + let(:path) { File.join('spec', 'fixtures', 'fkey.csv') } + let(:default) { { path: path, creator: Helpers.method(:fake_creator_method) } } + let(:klass) { TestClass.new(key: filekey, data: Kiba::Extend::FileRegistryEntry.new(data)) } + + context 'when called without creator' do + let(:data) { { path: path } } + it 'raises NoDependencyCreatorError' do + msg = "No creator method found for :#{filekey} in file registry" + expect { + TestClass.new(key: filekey, + data: Kiba::Extend::FileRegistryEntry.new(data)).required + }.to raise_error( + Kiba::Extend::RequirableFile::NoDependencyCreatorError, msg + ) + end + end + + describe '#required' do + let(:result) { klass.required } + let(:data) { default } + context 'when file does not exist at path' do + it 'returns creator Method' do + expect(result).to eq(Helpers.method(:fake_creator_method)) + end + end + + context 'when file exists at path' do + let(:path) { File.join(fixtures_dir, 'base_job_base.csv') } + it 'returns nil' do + expect(result).to be nil + end + end + end +end +# rubocop:enable Metrics/BlockLength diff --git a/spec/kiba/extend/sources/file_set_spec.rb 
b/spec/kiba/extend/sources/file_set_spec.rb new file mode 100644 index 000000000..9f19a8934 --- /dev/null +++ b/spec/kiba/extend/sources/file_set_spec.rb @@ -0,0 +1,66 @@ +# frozen_string_literal: true + +require 'spec_helper' + +# rubocop:disable Metrics/BlockLength +RSpec.describe 'Kiba::Extend::Sources::FileSet' do + before(:context) do + @path = File.join(fixtures_dir, 'fileset') + FileUtils.mkdir(@path) + FileUtils.touch(File.join(@path, 'a.csv')) + FileUtils.touch(File.join(@path, 'b.csv')) + FileUtils.touch(File.join(@path, 't.txt')) + FileUtils.touch(File.join(@path, '.~lock.a.csv')) + end + after(:context) { FileUtils.rm_rf(@path) } + + let(:args) { { path: @path } } + let(:set) { Kiba::Extend::Sources::FileSet.new(args) } + describe '#files' do + let(:result) { set.files } + context 'with defaults' do + it 'returns expected files' do + expect(result.length).to eq(4) + end + end + + context 'with include' do + let(:args) { { path: @path, include: '.*\.csv$' } } + it 'returns expected files' do + expect(result.length).to eq(3) + end + end + + context 'with exclude' do + let(:args) { { path: @path, exclude: '^\.~lock' } } + it 'returns expected files' do + expect(result.length).to eq(3) + end + end + + context 'with include and exclude' do + let(:args) { { path: @path, include: '.*\.csv$', exclude: '^\.~lock' } } + it 'returns expected files' do + expect(result.length).to eq(2) + end + end + + context 'when recursive' do + before(:context) do + subdir = File.join(@path, 'subdir') + FileUtils.mkdir(subdir) + FileUtils.touch(File.join(subdir, 'd.csv')) + FileUtils.touch(File.join(subdir, 'e.csv')) + FileUtils.touch(File.join(subdir, 'f.xml')) + FileUtils.touch(File.join(subdir, '.~lock.d.csv')) + end + context 'with include and exclude' do + let(:args) { { path: @path, recursive: true, include: '.*\.csv$', exclude: '^\.~lock' } } + it 'returns expected files' do + expect(result.length).to eq(4) + end + end + end + end +end +# rubocop:enable Metrics/BlockLength 
diff --git a/spec/kiba/extend/transforms/append_spec.rb b/spec/kiba/extend/transforms/append_spec.rb index f487f53fe..321816526 100644 --- a/spec/kiba/extend/transforms/append_spec.rb +++ b/spec/kiba/extend/transforms/append_spec.rb @@ -3,7 +3,6 @@ require 'spec_helper' RSpec.describe Kiba::Extend::Transforms::Append do - before { generate_csv(rows) } describe 'NilFields' do diff --git a/spec/kiba/extend/transforms/clean_spec.rb b/spec/kiba/extend/transforms/clean_spec.rb index a89bef25d..fe704ce7d 100644 --- a/spec/kiba/extend/transforms/clean_spec.rb +++ b/spec/kiba/extend/transforms/clean_spec.rb @@ -4,9 +4,9 @@ RSpec.describe Kiba::Extend::Transforms::Clean do describe 'AlphabetizeFieldValues' do -# test_csv = File.join(__dir__, 'tmp', 'test.csv') -# binding.pry -# + # test_csv = File.join(__dir__, 'tmp', 'test.csv') + # binding.pry + # rows = [ %w[type], ['Person;unmapped;Organization'], @@ -120,7 +120,6 @@ end describe 'DelimiterOnlyFields' do - let(:rows) do [ %w[id in_set], @@ -184,7 +183,6 @@ end describe 'EmptyFieldGroups' do - let(:rows) do [ %w[id a1 a2 b1 b2 b3], diff --git a/spec/kiba/extend/transforms/deduplicate_spec.rb b/spec/kiba/extend/transforms/deduplicate_spec.rb index 1ba5fa0c5..6759f8dce 100644 --- a/spec/kiba/extend/transforms/deduplicate_spec.rb +++ b/spec/kiba/extend/transforms/deduplicate_spec.rb @@ -3,23 +3,28 @@ require 'spec_helper' RSpec.describe Kiba::Extend::Transforms::Deduplicate do + let(:test_job_config){ { source: input, destination: output } } + let(:test_job) { Kiba::Extend::Jobs::TestingJob.new(files: test_job_config, transformer: test_job_transforms) } + let(:output){ [] } + describe 'Fields' do context 'when casesensitive = true' do - it 'removes value(s) of source field from target field(s)' do - rows = [ - %w[x y z], - %w[a a b], - ['a', 'a ', 'a'], - ['a', 'b;a', 'a;c'], - ['a;b', 'b;a', 'a;c'], - %w[a aa bat], - [nil, 'a', nil], - ['', ' ;a', 'b;'], - ['a', nil, nil], - %w[a A a] + let(:input) do + [ + {x: 'a', y: 
'a', z: 'b'}, + {x: 'a', y: 'a', z: 'a'}, + {x: 'a', y: 'b;a', z: 'a;c'}, + {x: 'a;b', y: 'b;a', z: 'a;c'}, + {x: 'a', y: 'aa', z: 'bat'}, + {x: nil, y: 'a', z: nil}, + {x: '', y: ';a', z: 'b;'}, + {x: 'a', y: nil, z: nil}, + {x: 'a', y: 'A', z: 'a'}, ] - generate_csv(rows) - expected = [ + end + + let(:expected) do + [ { x: 'a', y: nil, z: 'b' }, { x: 'a', y: nil, z: nil }, { x: 'a', y: 'b', z: 'c' }, @@ -30,113 +35,186 @@ { x: 'a', y: nil, z: nil }, { x: 'a', y: 'A', z: nil } ] - result = execute_job(filename: test_csv, - xform: Deduplicate::Fields, - xformopt: { source: :x, targets: %i[y z], multival: true, sep: ';' }) - expect(result).to eq(expected) + end + + let(:test_job_transforms) do + Kiba.job_segment do + transform Deduplicate::Fields, source: :x, targets: %i[y z], multival: true, sep: ';' + end + end + + it 'removes value(s) of source field from target field(s)' do + test_job + expect(output).to eq(expected) end end + context 'when casesensitive = false' do - it 'removes value(s) of source field from target field(s)' do - rows = [ - %w[x y z], - %w[a A a], - %w[a a B] + let(:input) do + [ + { x: 'a', y: 'A', z: 'a' }, + { x: 'a', y: 'a', z: 'B' }, ] - generate_csv(rows) - expected = [ + end + + let(:expected) do + [ { x: 'a', y: nil, z: nil }, { x: 'a', y: nil, z: 'B' } ] - result = execute_job(filename: test_csv, - xform: Deduplicate::Fields, - xformopt: { source: :x, targets: %i[y z], multival: false, - casesensitive: false }) - expect(result).to eq(expected) + end + + let(:test_job_transforms) do + Kiba.job_segment do + transform Deduplicate::Fields, + source: :x, + targets: %i[y z], + multival: true, + sep: ';', + casesensitive: false + end + end + it 'removes value(s) of source field from target field(s)' do + test_job + expect(output).to eq(expected) end end end describe 'FieldValues' do - rows = [ - %w[val x], - ['1;1;1;2;2;2', 'a;A;b;b;b'], - ['', 'q;r;r'], - %w[1 2], - [1, 2] - ] - before do - generate_csv(rows) - end - it 'removes duplicate 
values in one field (NOT safe for fieldgroups)' do - expected = [ - { val: '1;2', x: 'a;A;b' }, - { val: '', x: 'q;r' }, - { val: '1', x: '2' }, - { val: '1', x: '2' } + let(:input) do + [ + {foo: '1;1;1;2;2;2', bar: 'a;A;b;b;b'}, + {foo: '', bar: 'q;r;r'}, + {foo: '1', bar: '2'}, + {foo: 1, bar: 2} ] - result = execute_job(filename: test_csv, - xform: Deduplicate::FieldValues, - xformopt: { fields: %i[val x], sep: ';' }) - expect(result).to eq(expected) + end + + context 'when deleting deduplication field' do + let(:test_job_transforms) do + Kiba.job_segment do + transform Deduplicate::FieldValues, fields: %i[foo bar], sep: ';' + end + end + + it 'deduplicates values in each field' do + expected = [ + {foo: '1;2', bar: 'a;A;b'}, + {foo: '', bar: 'q;r'}, + {foo: '1', bar: '2'}, + {foo: '1', bar: '2'} + ] + test_job + expect(output).to eq(expected) + end end end describe 'Flag' do - rows = [ - %w[id x], - %w[1 a], - %w[2 a], - %w[1 b], - %w[3 b] - ] - before do - generate_csv(rows) - @deduper = {} - end - it 'adds column with y/n to indicate duplicate records' do - expected = [ - { id: '1', x: 'a', d: 'n' }, - { id: '2', x: 'a', d: 'n' }, - { id: '1', x: 'b', d: 'y' }, - { id: '3', x: 'b', d: 'n' } + let(:input) do + [ + {id: '1', x: 'a'}, + {id: '2', x: 'a'}, + {id: '1', x: 'b'}, + {id: '3', x: 'b'}, ] - opt = { - on_field: :id, - in_field: :d, - using: @deduper - } - result = execute_job(filename: test_csv, - xform: Deduplicate::Flag, - xformopt: opt) - expect(result).to eq(expected) + end + + context 'when deleting deduplication field' do + let(:test_job_transforms) do + Kiba.job_segment do + @deduper = {} + transform Deduplicate::Flag, on_field: :id, in_field: :d, using: @deduper + end + end + it 'deduplicates and removes field' do + expected = [ + { id: '1', x: 'a', d: 'n' }, + { id: '2', x: 'a', d: 'n' }, + { id: '1', x: 'b', d: 'y' }, + { id: '3', x: 'b', d: 'n' } + ] + test_job + expect(output).to eq(expected) + end end end describe 'GroupedFieldValues' do - 
rows = [ - %w[name role], - ['Fred;Freda;Fred;James', 'author;photographer;editor;illustrator'], - [';', ';'], - %w[1 2] - ] - before do - generate_csv(rows) + let(:test_job_transforms) do + Kiba.job_segment do + transform Deduplicate::GroupedFieldValues, + on_field: :name, + grouped_fields: %i[work role], + sep: ';' + end + end + + let(:input) do + [ + {name: 'Fred;Freda;Fred;James', work: 'Report;Book;Paper;Book', role: 'author;photographer;editor;illustrator'}, + {name: ';', work: ';', role: ';'}, + {name: 'Martha', work: 'Book', role: 'contributor'} + ] end + + let(:expected) do + [ + { name: 'Fred;Freda;James', work: 'Report;Book;Book', role: 'author;photographer;illustrator' }, + { name: nil, work: nil, role: nil }, + {name: 'Martha', work: 'Book', role: 'contributor'} + ] + end + it 'removes duplicate values in one field, and removes corresponding fieldgroup values' do - expected = [ - { name: 'Fred;Freda;James', role: 'author;photographer;illustrator' }, - { name: nil, role: nil }, - { name: '1', role: '2' } + # Helpers::ExampleFormatter.new(input, expected) + test_job + expect(output).to eq(expected) + end + end + + describe 'Table' do + let(:input) do + [ + {foo: 'a', bar: 'b', baz: 'f', combined: 'a b'}, + {foo: 'c', bar: 'd', baz: 'g', combined: 'c d'}, + {foo: 'c', bar: 'e', baz: 'h', combined: 'c e'}, + {foo: 'c', bar: 'd', baz: 'i', combined: 'c d'}, ] - result = execute_job(filename: test_csv, - xform: Deduplicate::GroupedFieldValues, - xformopt: { - on_field: :name, - grouped_fields: %i[role], - sep: ';' - }) - expect(result).to eq(expected) + end + + context 'when deleting deduplication field' do + let(:test_job_transforms) do + Kiba.job_segment do + transform Deduplicate::Table, field: :combined, delete_field: true + end + end + it 'deduplicates and removes field' do + expected = [ + {foo: 'a', bar: 'b', baz: 'f'}, + {foo: 'c', bar: 'd', baz: 'g'}, + {foo: 'c', bar: 'e', baz: 'h'} + ] + test_job + expect(output).to eq(expected) + end + end + + 
context 'when keeping deduplication field' do + let(:test_job_transforms) do + Kiba.job_segment do + transform Deduplicate::Table, field: :foo + end + end + it 'deduplicates and retains all fields' do + expected = [ + {foo: 'a', bar: 'b', baz: 'f', combined: 'a b'}, + {foo: 'c', bar: 'd', baz: 'g', combined: 'c d'} + ] + test_job + expect(output).to eq(expected) + end end end end diff --git a/spec/kiba/extend/transforms/extract_spec.rb b/spec/kiba/extend/transforms/extract_spec.rb new file mode 100644 index 000000000..ebf228abd --- /dev/null +++ b/spec/kiba/extend/transforms/extract_spec.rb @@ -0,0 +1,57 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe Kiba::Extend::Transforms::Extract do + describe 'Fields' do + let(:input) do + [ + {foo: 'a:b', bar: 'e', baz: 'f', boo: ''}, + {foo: 'c', bar: nil, baz: 'g', boo: 'h'}, + {foo: ':d', bar: 'i:', baz: 'j', boo: 'k'} + ] + end + let(:output){ [] } + let(:test_job_config){ { source: input, destination: output } } + let(:test_job) { Kiba::Extend::Jobs::TestingJob.new(files: test_job_config, transformer: test_job_transforms) } + + context 'with sep and source_field_track = true' do + let(:test_job_transforms) do + Kiba.job_segment do + transform Extract::Fields, fields: %i[foo bar], sep: ':' + end + end + it 'extracts split multivalues' do + expected = [ + {value: 'a', from_field: :foo}, + {value: 'b', from_field: :foo}, + {value: 'e', from_field: :bar}, + {value: 'c', from_field: :foo}, + {value: 'd', from_field: :foo}, + {value: 'i', from_field: :bar} + ] + test_job + expect(output).to eq(expected) + end + end + + context 'with no sep and source_field_track = false' do + let(:test_job_transforms) do + Kiba.job_segment do + transform Extract::Fields, fields: %i[foo bar], source_field_track: false + end + end + it 'extracts multivalues without splitting' do + expected = [ + {value: 'a:b'}, + {value: 'e'}, + {value: 'c'}, + {value: ':d'}, + {value: 'i:'} + ] + test_job + expect(output).to 
eq(expected) + end + end + end +end diff --git a/spec/kiba/extend/transforms/helpers_spec.rb b/spec/kiba/extend/transforms/helpers_spec.rb index 9c9000453..73d854b12 100644 --- a/spec/kiba/extend/transforms/helpers_spec.rb +++ b/spec/kiba/extend/transforms/helpers_spec.rb @@ -84,14 +84,16 @@ end describe '#field_values' do - let(:row) { { - a: nil, - b: '', - c: ';', - d: 'foo', - e: '%NULLVALUE%', - f: '%NULLVALUE%;%NULLVALUE%' - } } + let(:row) { + { + a: nil, + b: '', + c: ';', + d: 'foo', + e: '%NULLVALUE%', + f: '%NULLVALUE%;%NULLVALUE%' + } + } let(:fields) { %i[a b c d e f] } let(:discard) { %i[nil empty delim] } let(:delim) { ';' } diff --git a/spec/kiba/extend/transforms/merge_spec.rb b/spec/kiba/extend/transforms/merge_spec.rb index e7d657daa..5ed5353da 100644 --- a/spec/kiba/extend/transforms/merge_spec.rb +++ b/spec/kiba/extend/transforms/merge_spec.rb @@ -3,7 +3,6 @@ require 'spec_helper' RSpec.describe Kiba::Extend::Transforms::Merge do - before do generate_csv(rows) end diff --git a/spec/kiba/extend/transforms/prepend_spec.rb b/spec/kiba/extend/transforms/prepend_spec.rb index af7828462..75902a731 100644 --- a/spec/kiba/extend/transforms/prepend_spec.rb +++ b/spec/kiba/extend/transforms/prepend_spec.rb @@ -25,9 +25,11 @@ opts = { target_field: :name, prepended_field: :prependval, sep: ':', multivalue_prepended_field: true } - expect {execute_job(filename: test_csv, - xform: Prepend::FieldToFieldValue, - xformopt: opts)}.to raise_error(msg) + expect { + execute_job(filename: test_csv, + xform: Prepend::FieldToFieldValue, + xformopt: opts) + }.to raise_error(msg) end end diff --git a/spec/kiba/extend/transforms/split_spec.rb b/spec/kiba/extend/transforms/split_spec.rb index 30ed76f8a..e9f8fa710 100644 --- a/spec/kiba/extend/transforms/split_spec.rb +++ b/spec/kiba/extend/transforms/split_spec.rb @@ -4,21 +4,23 @@ RSpec.describe Kiba::Extend::Transforms::Split do describe 'IntoMultipleColumns' do - before(:each) do generate_csv(rows) end context 
'without max_segments param' do - let(:rows) { [ - %w[summary], - [''] - ] } - let(:result) { execute_job(filename: test_csv, - xform: Split::IntoMultipleColumns, - xformopt: { - field: :summary, - sep: ':' - }) + let(:rows) { + [ + %w[summary], + [''] + ] + } + let(:result) { + execute_job(filename: test_csv, + xform: Split::IntoMultipleColumns, + xformopt: { + field: :summary, + sep: ':' + }) } it 'raises ArgumentError with expected message' do expect { result }.to raise_error(ArgumentError, 'missing keyword: :max_segments') @@ -26,12 +28,14 @@ end context 'when sep = : and value = a:b and c' do - let(:rows) { [ - %w[summary], - ['a:b'], - ['c'], - [':d'] - ] } + let(:rows) { + [ + %w[summary], + ['a:b'], + ['c'], + [':d'] + ] + } context 'with max_segments = 2' do it 'fills in blank field before @sep with empty string and empty extra columns to the right with nil' do @@ -54,13 +58,15 @@ context 'and max_segments = 3' do context 'and value = a:b:c:d:e' do - let(:rows) { [ - %w[summary], - ['a:b:c:d:e'], - ['f:g'], - [''], - [nil] - ] } + let(:rows) { + [ + %w[summary], + ['a:b:c:d:e'], + ['f:g'], + [''], + [nil] + ] + } context 'and collapse_on = :right' do context 'and no warnfield given' do it 'collapses on right' do diff --git a/spec/kiba/extend/transforms/take_spec.rb b/spec/kiba/extend/transforms/take_spec.rb index 5e6d9c0f8..cffff154a 100644 --- a/spec/kiba/extend/transforms/take_spec.rb +++ b/spec/kiba/extend/transforms/take_spec.rb @@ -3,7 +3,6 @@ require 'spec_helper' RSpec.describe Kiba::Extend::Transforms::Take do - let(:result) { execute_job(filename: test_csv, xform: transform, xformopt: opts) } before do generate_csv(rows) @@ -14,13 +13,15 @@ describe 'First' do let(:transform) { Take::First } - let(:rows) { [ - %w[a b], - ['c|d', 'e|j'], - ['', nil], - ['|f', 'g|'], - ['h', 'i'] - ] } + let(:rows) { + [ + %w[a b], + ['c|d', 'e|j'], + ['', nil], + ['|f', 'g|'], + ['h', 'i'] + ] + } context 'when a, b -> y, z' do let(:opts) { { fields: %i[a b], 
targets: %i[y z], delim: '|' } } diff --git a/spec/kiba/extend/utils/lookup_spec.rb b/spec/kiba/extend/utils/lookup_spec.rb index 4461a9606..cbb05c75c 100644 --- a/spec/kiba/extend/utils/lookup_spec.rb +++ b/spec/kiba/extend/utils/lookup_spec.rb @@ -14,21 +14,6 @@ after { File.delete(test_csv) if File.exist?(test_csv) } describe '#csv_to_hash' do - lookup_hash = { - '1' => { id: '1', val: 'a' }, - '2' => { id: '2', val: 'b' }, - '3' => { id: '3', val: 'd' } - } - - it 'returns hash with key = keycolumn value and value = last occurring row w/that key ' do - result = Lookup.csv_to_hash(file: test_csv, - csvopt: CSVOPT, - keycolumn: :id) - expect(result).to eq(lookup_hash) - end - end - - describe '#csv_to_multi_hash' do lookup_hash = { '1' => [{ id: '1', val: 'a' }], '2' => [{ id: '2', val: 'b' }],