diff --git a/src/CatArrays.jl b/src/CatArrays.jl index 7126a122..e69de29b 100644 --- a/src/CatArrays.jl +++ b/src/CatArrays.jl @@ -1,207 +0,0 @@ -module CatArrays -using Base -import NCDatasets - -mutable struct CatArray{T,N,M,TA} <: AbstractArray{T,N} where TA <: AbstractArray - # dimension over which the sub-arrays are concatenated - dim::Int - # tuple of all sub-arrays - arrays::NTuple{M,TA} - # offset indices of every subarrays in the combined array - # (0-based, i.e. 0 = no offset) - offset::NTuple{M,NTuple{N,Int}} - # size of the combined array - sz::NTuple{N,Int} -end - -""" -C = CatArray(dim,array1,array2,...) - -Create a concatenated array view from a list of arrays. Individual -elements can be accessed by subscribs -""" -function CatArray(dim::Int,arrays...) - M = length(arrays) - - # number of dimensions - N = ndims(arrays[1]) - if dim > N - N = dim - end - - # check if dimensions are consistent - for i = 2:M - for j = 1:N - if j !== dim - if size(arrays[i],j) !== size(arrays[1],j) - error("Array number $i has inconsistent size $(size(arrays[i]))") - end - end - end - end - - # offset of each sub-array - countdim = 0 - offset = ntuple(M) do i - off = ntuple(j -> (j == dim ? countdim : 0), N) - countdim += size(arrays[i],dim) - off - end - - # size of concatenated array - sz = ntuple(j -> (j == dim ? countdim : size(arrays[1],j)), N) - - TA = typeof(arrays[begin]) - T = eltype(arrays[begin]) - for i = (firstindex(arrays)+1):lastindex(arrays) - T = promote_type(T,eltype(arrays[i])) - TA = promote_type(TA,typeof(arrays[i])) - end - - return CatArray{T,N,M,TA}( - dim, - arrays, - offset, - sz) -end - - -function Base.getindex(CA::CatArray{T,N},idx...) where {T,N} - checkbounds(CA,idx...) - - sz = NCDatasets._shape_after_slice(size(CA),idx...) - idx_global_local = index_global_local(CA,idx) - B = Array{T,length(sz)}(undef,sz...) - - for (array,(idx_global,idx_local)) in zip(CA.arrays,idx_global_local) - if valid_local_idx(idx_local...) - # get subset from subarray - subset = array[idx_local...] - B[idx_global...] = subset - end - end - - if sz == () - # scalar - return B[] - else - return B - end -end - -Base.size(CA::CatArray) = CA.sz - - -function valid_local_idx(local_idx::Integer,local_idx_rest...) - return (local_idx > 0) && valid_local_idx(local_idx_rest...) -end - -function valid_local_idx(local_idx::AbstractVector,local_idx_rest...) - return (length(local_idx) > 0) && valid_local_idx(local_idx_rest...) -end - -# stop condition -valid_local_idx() = true - -# iterate thought all indices by peeling the first value of a tuple off -# which results in better type stability -function gli(offset,subarray,idx) - idx_global_tmp,idx_local_tmp = _gli(offset,subarray,1,(),(),idx...) - return idx_global_tmp,idx_local_tmp -end - -function _gli(offset,subarray,i,idx_global,idx_local,idx::Integer,idx_rest...) - # rebase subscribt - idx_offset = idx - offset[i] - # only indeces within bounds of the subarray - if !(1 <= idx_offset <= size(subarray,i)) - idx_offset = -1 - end - - # scalar indices are not part of idx_global (as they are dropped) - return _gli(offset,subarray,i+1,idx_global,(idx_local...,idx_offset),idx_rest...) -end - -function _gli(offset,subarray,i,idx_global,idx_local,idx::Colon,idx_rest...) - idx_global_tmp = offset[i] .+ (1:size(subarray,i)) - idx_local_tmp = 1:size(subarray,i) - - return _gli(offset,subarray,i+1, - (idx_global...,idx_global_tmp), - (idx_local..., idx_local_tmp),idx_rest...) 
-end - -function _gli(offset,subarray,i,idx_global,idx_local,idx::AbstractRange,idx_rest...) - # within bounds of the subarray - within(j) = 1 <= j <= size(subarray,i) - - # rebase subscript - idx_offset = idx .- offset[i] - - n_within = count(within,idx_offset) - - if n_within == 0 - idx_local_tmp = 1:0 - idx_global_tmp = 1:0 - else - # index for getting the data from the local array - idx_local_tmp = idx_offset[findfirst(within,idx_offset):findlast(within,idx_offset)] - idx_global_tmp = (1:n_within) .+ (findfirst(within,idx_offset) - 1) - end - - return _gli(offset,subarray,i+1, - (idx_global...,idx_global_tmp), - (idx_local..., idx_local_tmp),idx_rest...) -end - -function _gli(offset,subarray,i,idx_global,idx_local,idx::Vector{T},idx_rest...) where T <: Integer - # within bounds of the subarray - within(j) = 1 <= j <= size(subarray,i) - - # rebase subscribt - idx_offset = idx .- offset[i] - - # index for getting the data from the local array - idx_local_tmp = filter(within,idx_offset) - idx_global_tmp = filter(i -> within(idx_offset[i]),1:length(idx)) - - return _gli(offset,subarray,i+1, - (idx_global...,idx_global_tmp), - (idx_local..., idx_local_tmp),idx_rest...) -end - - -# stop condition -# all indices have been processed -_gli(offset,subarray,i,idx_global,idx_local) = (idx_global,idx_local) - -function index_global_local(CA::CatArray{T,N,M,TA},idx) where {T,N,M,TA} - # number of indices must be equal to dimension - @assert(length(idx) == N) - - idx_global_local = ntuple(j -> gli(CA.offset[j],CA.arrays[j],idx),M) - - return idx_global_local -end - -function Base.setindex!(CA::CatArray{T,N},data,idx...) where {T,N} - idx_global_local = index_global_local(CA,idx) - @debug ind,idx_global,idx_local,sz - - for (array,(idx_global,idx_local)) in zip(CA.arrays,idx_global_local) - if valid_local_idx(idx_local...) - subset = @view data[idx_global...] - @debug idx_local - # set subset in array - array[idx_local...] = subset - end - end - - return data -end - -# load all the data at once -Base.Array(CA::CatArray{T,N}) where {T,N} = CA[ntuple(i -> :, Val(N))...] 
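For readers unfamiliar with this internal helper, here is a minimal illustrative sketch of how the `CatArray` removed above behaves, assuming `CatArrays` remains reachable as `NCDatasets.CatArrays` (this patch switches to importing it from CommonDataModel rather than defining it locally); the array values are made up for the example:

```julia
using NCDatasets: CatArrays

A = [1 3 5; 2 4 6]                  # 2×3
B = [7 10 13; 8 11 14; 9 12 15]     # 3×3

# lazy concatenation along dimension 1; no data is copied
C = CatArrays.CatArray(1, A, B)

size(C)      # (5, 3)
C[3, 2]      # 10, read from B[1, 2] through the offset bookkeeping above
Array(C)     # materializes the full 5×3 array, equal to vcat(A, B)
```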
- -export CatArray -end diff --git a/src/NCDatasets.jl b/src/NCDatasets.jl index dde8391d..6ea7e108 100644 --- a/src/NCDatasets.jl +++ b/src/NCDatasets.jl @@ -29,16 +29,24 @@ using Printf using CommonDataModel using CommonDataModel: dims, attribs, groups import CommonDataModel: AbstractDataset, AbstractVariable, - boundsParentVar, + boundsParentVar, initboundsmap!, fillvalue, fill_and_missing_values, scale_factor, add_offset, time_origin, time_factor, CFtransformdata!, CFVariable, variable, cfvariable, defVar, path, name, isopen, unlimited, dataset, - groupnames, group, defGroup, + groupname, groupnames, group, defGroup, dimnames, dim, defDim, attribnames, attrib, defAttrib, - varbyattrib, CFStdName, @CF_str, ancillaryvariables, filter, coord, bounds + varbyattrib, CFStdName, @CF_str, ancillaryvariables, filter, coord, bounds, + MFDataset, MFCFVariable, + DeferDataset, metadata, Resource, + SubDataset, SubVariable, subsub, + chunking, deflate, checksum, fillmode, + iswritable, sync, CatArrays, + SubDataset, + @select, select, Near, coordinate_value, coordinate_names, split_by_and + import DiskArrays import DiskArrays: readblock!, writeblock!, eachchunk, haschunks using DiskArrays: @implement_diskarray diff --git a/src/cfvariable.jl b/src/cfvariable.jl index 12e29c96..cb1087f9 100644 --- a/src/cfvariable.jl +++ b/src/cfvariable.jl @@ -283,14 +283,6 @@ function Base.getindex(ds::AbstractNCDataset,varname::SymbolOrString) return cfvariable(ds, varname) end -""" - dimnames(v::CFVariable) - -Return a tuple of strings with the dimension names of the variable `v`. -""" -dimnames(v::Union{CFVariable,MFCFVariable}) = dimnames(v.var) - - """ dimsize(v::CFVariable) Get the size of a `CFVariable` as a named tuple of dimension → length. @@ -300,164 +292,6 @@ function dimsize(v::Union{CFVariable,MFCFVariable,SubVariable}) names = Symbol.(dimnames(v)) return NamedTuple{names}(s) end -export dimsize - - -name(v::Union{CFVariable,MFCFVariable}) = name(v.var) -chunking(v::CFVariable,storage,chunksize) = chunking(v.var,storage,chunksize) -chunking(v::CFVariable) = chunking(v.var) - -deflate(v::CFVariable,shuffle,dodeflate,deflate_level) = deflate(v.var,shuffle,dodeflate,deflate_level) -deflate(v::CFVariable) = deflate(v.var) - -checksum(v::CFVariable,checksummethod) = checksum(v.var,checksummethod) -checksum(v::CFVariable) = checksum(v.var) - - -fillmode(v::CFVariable) = fillmode(v.var) - - - -############################################################ -# CFVariable -############################################################ - -# indexing with vector of integers - -to_range_list(index::Integer,len) = index - -to_range_list(index::Colon,len) = [1:len] -to_range_list(index::AbstractRange,len) = [index] - -function to_range_list(index::Vector{T},len) where T <: Integer - grow(istart) = istart[begin]:(istart[end]+step(istart)) - - baseindex = 1 - indices_ranges = UnitRange{T}[] - - while baseindex <= length(index) - range = index[baseindex]:index[baseindex] - range_test = grow(range) - index_view = @view index[baseindex:end] - - while checkbounds(Bool,index_view,length(range_test)) && - (range_test[end] == index_view[length(range_test)]) - - range = range_test - range_test = grow(range_test) - end - - push!(indices_ranges,range) - baseindex += length(range) - end - - @assert reduce(vcat,indices_ranges,init=T[]) == index - return indices_ranges -end - -_range_indices_dest(of) = of -_range_indices_dest(of,i::Integer,rest...) = _range_indices_dest(of,rest...) - -function _range_indices_dest(of,v,rest...) 
- b = 0 - ind = similar(v,0) - for r in v - rr = 1:length(r) - push!(ind,b .+ rr) - b += length(r) - end - - _range_indices_dest((of...,ind),rest...) -end -range_indices_dest(ri...) = _range_indices_dest((),ri...) - -function Base.getindex(v::Union{MFVariable,SubVariable},indices::Union{Int,Colon,AbstractRange{<:Integer},Vector{Int}}...) - @debug "transform vector of indices to ranges" - - sz_source = size(v) - ri = to_range_list.(indices,sz_source) - sz_dest = NCDatasets._shape_after_slice(sz_source,indices...) - - N = length(indices) - - ri_dest = range_indices_dest(ri...) - @debug "ri_dest $ri_dest" - @debug "ri $ri" - - if all(==(1),length.(ri)) - # single chunk - R = first(CartesianIndices(length.(ri))) - ind_source = ntuple(i -> ri[i][R[i]],N) - ind_dest = ntuple(i -> ri_dest[i][R[i]],length(ri_dest)) - return v[ind_source...] - end - - dest = Array{eltype(v),length(sz_dest)}(undef,sz_dest) - for R in CartesianIndices(length.(ri)) - ind_source = ntuple(i -> ri[i][R[i]],N) - ind_dest = ntuple(i -> ri_dest[i][R[i]],length(ri_dest)) - #dest[ind_dest...] = v[ind_source...] - if hasproperty(v,:var) - buffer = Array{eltype(v.var),length(ind_dest)}(undef,length.(ind_dest)) - load!(v,view(dest,ind_dest...),buffer,ind_source...) - else - dest[ind_dest...] = v[ind_source...] - end - end - return dest -end - - -############################################################ -# Convertion to array -############################################################ - -Base.Array(v::AbstractVariable{T,N}) where {T,N} = v[ntuple(i -> :, Val(N))...] - - -""" - NCDatasets.load!(ncvar::CFVariable, data, buffer, indices) - -Loads a NetCDF variables `ncvar` in-place and puts the result in `data` (an -array of `eltype(ncvar)`) along the specified `indices`. `buffer` is a temporary - array of the same size as data but the type should be `eltype(ncv.var)`, i.e. -the corresponding type in the NetCDF files (before applying `scale_factor`, -`add_offset` and masking fill values). Scaling and masking will be applied to -the array `data`. - -`data` and `buffer` can be the same array if `eltype(ncvar) == eltype(ncvar.var)`. - -## Example: - -```julia -# create some test array -Dataset("file.nc","c") do ds - defDim(ds,"time",3) - ncvar = defVar(ds,"vgos",Int16,("time",),attrib = ["scale_factor" => 0.1]) - ncvar[:] = [1.1, 1.2, 1.3] - # store 11, 12 and 13 as scale_factor is 0.1 -end - - -ds = Dataset("file.nc") -ncv = ds["vgos"]; -# data and buffer must have the right shape and type -data = zeros(eltype(ncv),size(ncv)); # here Vector{Float64} -buffer = zeros(eltype(ncv.var),size(ncv)); # here Vector{Int16} -NCDatasets.load!(ncv,data,buffer,:,:,:) -close(ds) -``` -""" -@inline function load!(v::Union{CFVariable{T,N},MFCFVariable{T,N},SubVariable{T,N}}, data, buffer, indices::Union{Integer, AbstractRange{<:Integer}, Colon}...) where {T,N} - - if v.var == nothing - return load!(v,indices...) - else - load!(v.var,buffer,indices...) 
- fmv = fill_and_missing_values(v) - return CFtransformdata!(data,buffer,fmv,scale_factor(v),add_offset(v), - time_origin(v),time_factor(v)) - end -end +export dimsize diff --git a/src/dataset.jl b/src/dataset.jl index 6cb6dad2..5e41a931 100644 --- a/src/dataset.jl +++ b/src/dataset.jl @@ -88,19 +88,6 @@ function defmode(ds::Dataset) end end -"Initialize the ds._boundsmap variable" -function initboundsmap!(ds) - ds._boundsmap = Dict{String,String}() - for vname in keys(ds) - v = variable(ds,vname) - bounds = get(v.attrib,"bounds",nothing) - - if bounds !== nothing - ds._boundsmap[bounds] = vname - end - end -end - ############################################################ # High-level ############################################################ diff --git a/src/defer.jl b/src/defer.jl index eaf81c58..0f328f2a 100644 --- a/src/defer.jl +++ b/src/defer.jl @@ -1,89 +1,8 @@ -iswritable(dds::DeferDataset) = dds.r.mode != "r" -function metadata(ds::NCDataset) - # dimensions +DeferDataset(filename::AbstractString,mode::AbstractString = "r") = DeferDataset(NCDataset,filename::AbstractString,mode) - dim = OrderedDict() - unlimited_dims = unlimited(ds.dim) - - for (dimname,dimlen) in ds.dim - dim[dimname] = Dict( - :name => dimname, - :length => dimlen, - :unlimited => dimname in unlimited_dims - ) - end - - # variables - vars = OrderedDict() - - for (varname,ncvar) in ds - storage,chunksizes = chunking(ncvar.var) - isshuffled,isdeflated,deflatelevel = deflate(ncvar.var) - checksummethod = checksum(ncvar.var) - - vars[varname] = OrderedDict( - :name => varname, - :size => size(ncvar), - :eltype => eltype(ncvar.var), - :attrib => OrderedDict(ncvar.attrib), - :dimensions => dimnames(ncvar), - :chunksize => chunksizes, - :storage => storage, - :fillvalue => fillvalue(ncvar.var), - :shuffle => isshuffled, - :deflate => isdeflated, - :deflatelevel => deflatelevel, - :checksummethod => checksummethod, - ) - end - - group = OrderedDict() - for (groupname,ncgroup) in ds.group - group[groupname] = metadata(ncgroup) - end - - return OrderedDict( - :dim => dim, - :var => vars, - :attrib => OrderedDict(ds.attrib), - :group => group - ) -end - - -function DeferDataset(r::Resource,groupname::String,data::OrderedDict) - _boundsmap = nothing - dds = DeferDataset(r,groupname,data,_boundsmap) - if (r.mode == "r") - initboundsmap!(dds) - end - return dds -end - -function DeferDataset(filename,mode,info) - NCDataset(filename,mode) do ds - r = Resource(filename,mode,info) - groupname = "/" - return DeferDataset(r,groupname,info) - end -end - -function DeferDataset(filename,mode = "r") - NCDataset(filename,mode) do ds - info = metadata(ds) - r = Resource(filename,mode,info) - groupname = "/" - return DeferDataset(r,groupname,info) - end -end export DeferDataset -# files are not suppose to be open where using DeferDataset -close(dds::DeferDataset) = nothing -groupname(dds::DeferDataset) = dds.groupname -path(dds::DeferDataset) = dds.r.filename -Base.keys(dds::DeferDataset) = collect(keys(dds.data[:var])) function NCDataset(f::Function, r::Resource) NCDataset(r.filename,r.mode) do ds @@ -96,76 +15,3 @@ function NCDataset(f::Function, dds::DeferDataset) f(ds) end end - -function Variable(f::Function, dv::DeferVariable) - NCDataset(dv.r.filename,dv.r.mode) do ds - f(variable(ds,dv.varname)) - end -end - -function variable(dds::DeferDataset,varname::AbstractString) - data = get(dds.data[:var],varname,nothing) - if data == nothing - error("Dataset $(dds.r.filename) does not contain the variable $varname") - end - T = 
data[:eltype] - N = length(data[:dimensions]) - - return DeferVariable{T,N}(dds.r,varname,data) -end - -variable(dds::DeferDataset,varname::Symbol) = variable(dds,string(varname)) - -dataset(dv::DeferVariable) = DeferDataset(dv.r.filename) - -function Base.getindex(dv::DeferVariable,indexes::Union{Int,Colon,AbstractRange{<:Integer}}...) - Variable(dv) do v - return v[indexes...] - end -end - - -Base.size(dv::DeferVariable) = dv.data[:size] -dimnames(dv::DeferVariable) = dv.data[:dimensions] -name(dv::DeferVariable) = dv.varname - -#---------------------------------------------- - -dimnames(dds::DeferDataset) = collect(keys(dds.r.metadata[:dim])) - -dim(dds::DeferDataset,name::SymbolOrString) = - dds.r.metadata[:dim][String(name)][:length] - -unlimited(dd::DeferDataset) = [dimname for (dimname,dim) in dd.data[:dim] if dim[:unlimited]] - - -attribnames(dds::DeferDataset) = collect(keys(dds.r.metadata[:attrib])) -attrib(dds::DeferDataset,name::SymbolOrString) = dds.r.metadata[:attrib][String(name)] - - -attribnames(dv::DeferVariable) = collect(keys(dv.data[:attrib])) -attrib(dv::DeferVariable,name::SymbolOrString) = dv.data[:attrib][String(name)] - -#------------------------------------------------ - -groupnames(dds::DeferDataset) = collect(keys(dds.data[:group])) - -function group(dds::DeferDataset,name::SymbolOrString) - data = dds.data[:group][String(name)] - return DeferDataset(dds.r,String(name),data) -end - - -_storage_attributes(dv) = dv.r.metadata[:var][name(dv)] - -function chunking(dv::DeferVariable) - sa = _storage_attributes(dv) - return sa[:storage],sa[:chunksize] -end - -function deflate(dv::DeferVariable) - sa = _storage_attributes(dv) - return sa[:shuffle],sa[:deflate],sa[:deflatelevel] -end - -checksum(dv::DeferVariable) = _storage_attributes(dv)[:checksummethod] diff --git a/src/multifile.jl b/src/multifile.jl index 0ebcdfcb..3779b9c6 100644 --- a/src/multifile.jl +++ b/src/multifile.jl @@ -1,78 +1,3 @@ - -attribnames(ds::MFDataset) = attribnames(ds.ds[1]) - -attrib(ds::MFDataset,name::SymbolOrString) = attrib(ds.ds[1],name) - -attribnames(v::Union{MFCFVariable,MFVariable}) = attribnames(variable(v.ds.ds[1],v.varname)) - -attrib(v::Union{MFCFVariable,MFVariable},name::SymbolOrString) = attrib(variable(v.ds.ds[1],v.varname),name) - -function defAttrib(v::Union{MFCFVariable,MFVariable},name::SymbolOrString,data) - for ds in v.ds.ds - defAttrib(variable(v.ds,v.varname),name,data) - end - return data -end - -function defAttrib(ds::MFDataset,name::SymbolOrString,data) - for _ds in ds.ds - defAttrib(_ds,name,data) - end - return data -end - -function dim(ds::MFDataset,name::SymbolOrString) - if name == ds.aggdim - if ds.isnewdim - return length(ds.ds) - else - return sum(dim(_ds,name) for _ds in ds.ds) - end - else - return dim(ds.ds[1],name) - end -end - -function defDim(ds::MFDataset,name::SymbolOrString,data) - for _ds in ds.ds - defDim(_ds,name,data) - end - return data -end - -function dimnames(ds::MFDataset) - k = collect(dimnames(ds.ds[1])) - - if ds.isnewdim - push!(k,ds.aggdim) - end - - return k -end - -unlimited(ds::MFDataset) = unique(reduce(hcat,unlimited.(ds.ds))) - -groupnames(ds::MFDataset) = groupnames(ds.ds[1]) - -function group(mfds::MFDataset,name::SymbolOrString) - ds = group.(mfds.ds,name) - constvars = Symbol[] - return MFDataset(ds,mfds.aggdim,mfds.isnewdim,constvars) -end - -Base.Array(v::MFVariable) = Array(v.var) - -iswritable(mfds::MFDataset) = iswritable(mfds.ds[1]) - -function MFDataset(ds,aggdim,isnewdim,constvars) - _boundsmap = 
Dict{String,String}() - mfds = MFDataset(ds,aggdim,isnewdim,constvars,_boundsmap) - if !iswritable(mfds) - initboundsmap!(mfds) - end - return mfds -end - """ mfds = NCDataset(fnames, mode = "r"; aggdim = nothing, deferopen = true, isnewdim = false, @@ -176,144 +101,5 @@ function NCDataset(fnames::AbstractArray{TS,N},mode = "r"; aggdim = nothing, return mfds end -function close(mfds::MFDataset) - close.(mfds.ds) - return nothing -end - -function sync(mfds::MFDataset) - sync.(mfds.ds) - return nothing -end - -function path(mfds::MFDataset) - path(mfds.ds[1]) * "…" * path(mfds.ds[end]) -end -groupname(mfds::MFDataset) = groupname(mfds.ds[1]) - -function Base.keys(mfds::MFDataset) - if mfds.aggdim == "" - return unique(Iterators.flatten(keys.(mfds.ds))) - else - keys(mfds.ds[1]) - end -end - -Base.getindex(v::MFVariable,indexes::Union{Int,Colon,AbstractRange{<:Integer}}...) = getindex(v.var,indexes...) -Base.setindex!(v::MFVariable,data,indexes::Union{Int,Colon,AbstractRange{<:Integer}}...) = setindex!(v.var,data,indexes...) - -Base.size(v::MFVariable) = size(v.var) -Base.size(v::MFCFVariable) = size(v.var) -dimnames(v::MFVariable) = v.dimnames -name(v::MFVariable) = v.varname - - -function variable(mfds::MFDataset,varname::SymbolOrString) - if mfds.isnewdim - if Symbol(varname) in mfds.constvars - return variable(mfds.ds[1],varname) - end - # aggregated along a given dimension - vars = variable.(mfds.ds,varname) - v = CatArrays.CatArray(ndims(vars[1])+1,vars...) - return MFVariable(mfds,v, - (dimnames(vars[1])...,mfds.aggdim),String(varname)) - elseif mfds.aggdim == "" - # merge all variables - - # the latest dataset should be used if a variable name is present multiple times - for ds in reverse(mfds.ds) - if haskey(ds,varname) - return variable(ds,varname) - end - end - else - # aggregated along a given dimension - vars = variable.(mfds.ds,varname) - - dim = findfirst(dimnames(vars[1]) .== mfds.aggdim) - @debug "dimension $dim" - - if (dim != nothing) - v = CatArrays.CatArray(dim,vars...) - return MFVariable(mfds,v, - dimnames(vars[1]),String(varname)) - else - return vars[1] - end - end -end - -function cfvariable(mfds::MFDataset,varname::SymbolOrString) - if mfds.isnewdim - if Symbol(varname) in mfds.constvars - return cfvariable(mfds.ds[1],varname) - end - # aggregated along a given dimension - cfvars = cfvariable.(mfds.ds,varname) - cfvar = CatArrays.CatArray(ndims(cfvars[1])+1,cfvars...) - var = variable(mfds,varname) - - return MFCFVariable(mfds,cfvar,var, - dimnames(var),varname) - elseif mfds.aggdim == "" - # merge all variables - - # the latest dataset should be used if a variable name is present multiple times - for ds in reverse(mfds.ds) - if haskey(ds,varname) - return cfvariable(ds,varname) - end - end - else - # aggregated along a given dimension - cfvars = cfvariable.(mfds.ds,varname) - - dim = findfirst(dimnames(cfvars[1]) .== mfds.aggdim) - @debug "dim $dim" - - if (dim != nothing) - cfvar = CatArrays.CatArray(dim,cfvars...) - var = variable(mfds,varname) - - return MFCFVariable(mfds,cfvar,var, - dimnames(var),String(varname)) - else - return cfvars[1] - end - end -end - - -fillvalue(v::Union{MFVariable{T},MFCFVariable{T}}) where T = v.attrib["_FillValue"]::T -dataset(v::Union{MFVariable,MFCFVariable}) = v.ds - - -Base.getindex(v::MFCFVariable,ind...) = v.cfvar[ind...] -Base.setindex!(v::MFCFVariable,data,ind...) = v.cfvar[ind...] = data - - -function Base.cat(vs::AbstractVariable...; dims::Integer) - CatArrays.CatArray(dims,vs...) 
-end - -""" - storage,chunksizes = chunking(v::MFVariable) - -Return the storage type (`:contiguous` or `:chunked`) and the chunk sizes of the varable -`v` corresponding to the first NetCDF file. If the first NetCDF file in the collection -is chunked then this storage attributes are returns. If not the first NetCDF file is not contiguous, then multi-file variable is still reported as chunked with chunk size equal to the variable size. -""" -function chunking(v::MFVariable) - storage,chunksizes = chunking(v.ds.ds[1][name(v)]) - - if chunksizes == :contiguous - return (:chunked, collect(size(v))) - else - return storage,chunksizes - end -end - -deflate(v::MFVariable) = deflate(v.ds.ds[1][name(v)]) -checksum(v::MFVariable) = checksum(v.ds.ds[1][name(v)]) +NCDataset(ds::MFCFVariable) = dataset(ds) diff --git a/src/select.jl b/src/select.jl index 38218a86..e69de29b 100644 --- a/src/select.jl +++ b/src/select.jl @@ -1,310 +0,0 @@ -import Base: findfirst - -struct Near{TTarget,TTolerance} - target::TTarget - tolerance::TTolerance -end - -Near(target::Number) = Near(target,nothing) - - -function _findfirst(n::Near,v::AbstractVector) - diff, ind = findmin(x -> abs(x - n.target),v) - - if (n.tolerance != nothing) && (diff > n.tolerance) - return Int[] - else - return ind - end -end - -function scan_exp!(exp::Symbol,found) - newsym = gensym() - push!(found,exp => newsym) - return newsym -end - -function scan_exp!(exp::Expr,found) - if exp.head == :$ - return exp.args[1] - end - - if exp.head == :call - # skip function name - return Expr(exp.head,exp.args[1],scan_exp!.(exp.args[2:end],Ref(found))...) - elseif exp.head == :comparison - # :(3 <= lon <= 7.2) - args = Vector{Any}(undef,length(exp.args)) - for i = 1:length(exp.args) - if iseven(i) - # skip inflix operators - args[i] = exp.args[i] - else - args[i] = scan_exp!(exp.args[i],found) - end - end - return Expr(exp.head,args...) - else - return Expr(exp.head,scan_exp!.(exp.args[1:end],Ref(found))...) - end -end - -# neither Expr nor Symbol -scan_exp!(exp,found) = exp - -function scan_exp(exp::Expr) - found = Pair{Symbol,Symbol}[] - exp = scan_exp!(exp,found) - return found,exp -end - -function scan_coordinate_name(exp) - params,exp = scan_exp(exp) - - if length(params) != 1 - error("Multiple (or none) coordinates in expression $exp ($params) while looking for $(coordinate_names).") - end - param = params[1] - return param,exp -end - - -function split_by_and!(exp,sub_exp) - if exp.head == :&& - split_by_and!(exp.args[1],sub_exp) - split_by_and!(exp.args[2],sub_exp) - else - push!(sub_exp,exp) - end - return sub_exp -end - -split_by_and(exp) = split_by_and!(exp,[]) - -_intersect(r1::AbstractVector,r2::AbstractVector) = intersect(r1,r2) -_intersect(r1::AbstractVector,r2::Number) = (r2 in r1 ? r2 : []) -_intersect(r1::Number,r2::Number) = (r2 == r1 ? r2 : []) -_intersect(r1::Colon,r2) = r2 -_intersect(r1::Colon,r2::AbstractRange) = r2 - - - - - -""" - vsubset = NCDatasets.@select(v,expression) - dssubset = NCDatasets.@select(ds,expression) - -Return a subset of the variable `v` (or dataset `ds`) satisfying the condition `expression` as a view. The condition -has the following form: - -`condition₁ && condition₂ && condition₃ ... conditionₙ` - -Every condition should involve a single 1D NetCDF variable (typically a coordinate variable, referred as `coord` below). If `v` -is a variable, the related 1D NetCDF variable should have a shared dimension with the variable `v`. All local variables need to have a `\$` prefix (see examples below). 
This macro is experimental and subjected to change. - -Every condition can either perform: - -* a nearest match: `coord ≈ target_coord` (for `≈` type `\\approx` followed by the TAB-key). Only the data corresponding to the index closest to `target_coord` is loaded. - -* a nearest match with tolerance: `coord ≈ target_coord ± tolerance`. As before, but if the difference between the closest value in `coord` and `target_coord` is larger (in absolute value) than `tolerance`, an empty array is returned. - -* a condition operating on scalar values. For example, a `condition` equal to `10 <= lon <= 20` loads all data with the longitude between 10 and 20 or `abs(lat) > 60` loads all variables with a latitude north of 60° N and south of 60° S (assuming that the NetCDF has the 1D variables `lon` and `lat` for longitude and latitude). - -Only the data which satisfies all conditions is loaded. All conditions must be chained with an `&&` (logical and). They should not contain additional parenthesis or other logical operators such as `||` (logical or). - -To convert the view into a regular array one can use `collect`, `Array` or regular indexing. -As in julia, views of scalars are wrapped into a zero dimensional arrays which can be dereferenced by using `[]`. Modifying a view will modify the underlying NetCDF file (if -the file is opened as writable, otherwise an error is issued). - -As for any view, one can use `parentindices(vsubset)` to get the indices matching a select query. - -## Examples - -Create a sample file with random data: - -```julia -using NCDatasets, Dates -fname = "sample_file.nc" -lon = -180:180 -lat = -90:90 -time = DateTime(2000,1,1):Day(1):DateTime(2000,1,3) -SST = randn(length(lon),length(lat),length(time)) - -ds = NCDataset(fname,"c") -defVar(ds,"lon",lon,("lon",)); -defVar(ds,"lat",lat,("lat",)); -defVar(ds,"time",time,("time",)); -defVar(ds,"SST",SST,("lon","lat","time")); - - -# load by bounding box -v = NCDatasets.@select(ds["SST"],30 <= lon <= 60 && 40 <= lat <= 90) - -# substitute a local variable in condition using \$ -lonr = (30,60) # longitude range -latr = (40,90) # latitude range - -v = NCDatasets.@select(ds["SST"],\$lonr[1] <= lon <= \$lonr[2] && \$latr[1] <= lat <= \$latr[2]) - -# You can also select based on `ClosedInterval`s from `IntervalSets.jl`. -# Both 30..60 and 65 ± 25 construct `ClosedInterval`s, see their documentation for details. - -lon_interval = 30..60 -lat_interval = 65 ± 25 -v = NCDatasets.@select(ds["SST"], lon ∈ \$lon_interval && lat ∈ \$lat_interval) - -# get the indices matching the select query -(lon_indices,lat_indices,time_indices) = parentindices(v) - -# get longitude matchting the select query -v_lon = v["lon"] - -# find the nearest time instance -v = NCDatasets.@select(ds["SST"],time ≈ DateTime(2000,1,4)) - -# find the nearest time instance but not earlier or later than 2 hours -# an empty array is returned if no time instance is present - -v = NCDatasets.@select(ds["SST"],time ≈ DateTime(2000,1,3,1) ± Hour(2)) - -close(ds) -``` - -Any 1D variable with the same dimension name can be used in `@select`. 
For example, -if we have a time series of temperature and salinity, the temperature values -can also be selected based on salinity: - -```julia -# create a sample time series -using NCDatasets, Dates -fname = "sample_series.nc" -time = DateTime(2000,1,1):Day(1):DateTime(2009,12,31) -salinity = randn(length(time)) .+ 35 -temperature = randn(length(time)) - -NCDataset(fname,"c") do ds - defVar(ds,"time",time,("time",)); - defVar(ds,"salinity",salinity,("time",)); - defVar(ds,"temperature",temperature,("time",)); -end - -ds = NCDataset(fname) - -# load all temperature data from January where the salinity is larger than 35. -v = NCDatasets.@select(ds["temperature"],Dates.month(time) == 1 && salinity >= 35) - -# this is equivalent to -v2 = ds["temperature"][findall(Dates.month.(time) .== 1 .&& salinity .>= 35)] - -@test v == v2 -close(ds) -``` - -!!! note - - For optimal performance, one should try to load contiguous data ranges, in - particular when the data is loaded over HTTP/OPeNDAP. - -""" -macro select(v,expression) - expression_list = split_by_and(expression) - args = Vector{Any}(undef,length(expression_list)) - - # loop over all sub-expressions separated by && - for (i,e) in enumerate(expression_list) - (param,newsym),e = scan_coordinate_name(e) - - if (e.head == :call) && (e.args[1] == :≈) - @assert e.args[2] == newsym - target = e.args[3] - tolerance = nothing - - if (hasproperty(target,:head) && - (target.head == :call) && (target.args[1] == :±)) - value,tolerance = target.args[2:end] - else - value = target - end - - args[i] = quote - $(Meta.quot(param)) => Near($(esc(value)),$tolerance) - end - else - fun = Expr(:->,newsym,e) - args[i] = quote - $(Meta.quot(param)) => $(esc(fun)) - end - end - end - - return Expr(:call,:select,esc(v),args...) -end - - -function select(v,conditions...) - coord_names = coordinate_names(v) - if v isa AbstractArray - indices = Any[Colon() for _ in 1:ndims(v)] - else - indices = Dict{Symbol,Any}(((Symbol(d),Colon()) for d in dimnames(v))) - end - - for (param,condition) in conditions - coord,j = coordinate_value(v,param) - - if isa(condition,Near) - ind = _findfirst(condition,coord) - indices[j] = _intersect(indices[j],ind) - else - ind = findall(condition,coord) - indices[j] = _intersect(indices[j],ind) - end - end - - if v isa AbstractArray - view(v, indices...) - else - view(v; indices...) 
- end -end - -function coordinate_value(v::AbstractVariable,name_coord::Symbol) - ncv = dataset(v)[name_coord] - @assert ndims(ncv) == 1 - dimension_name = dimnames(ncv)[1] - i = findfirst(==(dimension_name),dimnames(v)) - fmtd(v) = join(dimnames(v),"×") - - if i == nothing - error("$name_coord (dimensions: $(fmtd(ncv))) and $(name(v)) (dimensions: $(fmtd(v))) do not share a named dimension") - end - return Array(ncv),i -end - - -function coordinate_names(v::AbstractVariable) - ds = dataset(v) - dimension_names = dimnames(v) - - return [Symbol(varname) for (varname,ncvar) in ds - if (ndims(ncvar) == 1) && dimnames(ncvar) ⊆ dimension_names] -end - - -function coordinate_value(ds::AbstractNCDataset,name_coord::Symbol) - ncv = ds[name_coord] - @assert ndims(ncv) == 1 - return Array(ncv),Symbol(dimnames(ncv)[1]) -end - -function coordinate_names(ds::AbstractNCDataset) - return [Symbol(varname) for (varname,ncvar) in ds - if (ndims(ncvar) == 1)] -end - - -# LocalWords: params vsubset conditionN NetCDF coord NCDatasets lon -# LocalWords: julia dereferenced parentindices fname nc DateTime ds -# LocalWords: randn NCDataset defVar lonr latr CFTime OPeNDAP args -# LocalWords: hasproperty esc Expr fmtd ncv diff --git a/src/subvariable.jl b/src/subvariable.jl index 2612f9e1..e69de29b 100644 --- a/src/subvariable.jl +++ b/src/subvariable.jl @@ -1,189 +0,0 @@ - -Base.parent(v::SubVariable) = v.parent -Base.parentindices(v::SubVariable) = v.indices -Base.size(v::SubVariable) = _shape_after_slice(size(v.parent),v.indices...) - -function dimnames(v::SubVariable) - dimension_names = dimnames(v.parent) - return dimension_names[map(i -> !(i isa Integer),collect(v.indices))] -end - -name(v::SubVariable) = name(v.parent) - -attribnames(v::SubVariable) = attribnames(v.parent) -attrib(v::SubVariable,name::SymbolOrString) = attrib(v.parent,name) -defAttrib(v::SubVariable,name::SymbolOrString,data) = defAttrib(v.parent,name,data) - -function SubVariable(A::AbstractVariable,indices...) - var = nothing - if hasproperty(A,:var) - if hasproperty(A.var,:attrib) - var = SubVariable(A.var,indices...) - end - end - T = eltype(A) - N = ndims(A) - return SubVariable{T,N,typeof(A),typeof(indices),typeof(A.attrib),typeof(var)}( - A,indices,A.attrib,var) -end - -SubVariable(A::AbstractVariable{T,N}) where T where N = SubVariable(A,ntuple(i -> :,N)...) - -# recursive calls so that the compiler can infer the types via inline-ing -# and constant propagation -_subsub(indices,i,l) = indices -_subsub(indices,i,l,ip,rest...) = _subsub((indices...,ip[i[l]]),i,l+1,rest...) -_subsub(indices,i,l,ip::Number,rest...) = _subsub((indices...,ip),i,l,rest...) -_subsub(indices,i,l,ip::Colon,rest...) = _subsub((indices...,i[l]),i,l+1,rest...) - -""" - j = subsub(parentindices,indices) - -Computed the tuple of indices `j` so that -`A[parentindices...][indices...] = A[j...]` for any array `A` and any tuple of -valid indices `parentindices` and `indices` -""" -subsub(parentindices,indices) = _subsub((),indices,1,parentindices...) - -materialize(v::SubVariable) = v.parent[v.indices...] - -""" -collect always returns an array. -Even if the result of the indexing is a scalar, it is wrapped -into a zero-dimensional array. -""" -function collect(v::SubVariable{T,N}) where T where N - if N == 0 - A = Array{T,0}(undef,()) - A[] = v.parent[v.indices...] - return A - else - return v.parent[v.indices...] - end -end - -Base.Array(v::SubVariable) = collect(v) - -function Base.view(v::SubVariable,indices::Union{Int,Colon,AbstractVector{Int}}...) 
- sub_indices = subsub(v.indices,indices) - SubVariable(parent(v),sub_indices...) -end - -""" - sv = view(v::NCDatasets.AbstractVariable,indices...) - -Returns a view of the variable `v` where indices are only lazily applied. -No data is actually copied or loaded. -Modifications to a view `sv`, also modifies the underlying array `v`. -All attributes of `v` are also present in `sv`. - -# Examples - -```julia -using NCDatasets -fname = tempname() -data = zeros(Int,10,11) -ds = NCDataset(fname,"c") -ncdata = defVar(ds,"temp",data,("lon","lat")) -ncdata_view = view(ncdata,2:3,2:4) -size(ncdata_view) -# output (2,3) -ncdata_view[1,1] = 1 -ncdata[2,2] -# outputs 1 as ncdata is also modified -close(ds) -``` - -""" -Base.view(v::Union{CFVariable, DeferVariable, MFCFVariable, MFVariable},indices::Union{Int,Colon,AbstractVector{Int}}...) = SubVariable(v,indices...) -Base.view(v::Variable,indices::Union{Int,Colon,AbstractVector{Int}}...) = SubVariable(v,indices...) -Base.view(v::SubVariable,indices::CartesianIndex) = view(v,indices.I...) -Base.view(v::SubVariable,indices::CartesianIndices) = view(v,indices.indices...) - -Base.getindex(v::SubVariable,indices::Union{Int,Colon,AbstractRange{<:Integer}}...) = materialize(view(v,indices...)) - -Base.getindex(v::SubVariable,indices::CartesianIndex) = getindex(v,indices.I...) -Base.getindex(v::SubVariable,indices::CartesianIndices) = - getindex(v,indices.indices...) - -function Base.setindex!(v::SubVariable,data,indices...) - sub_indices = subsub(v.indices,indices) - v.parent[sub_indices...] = data -end - -Base.setindex!(v::SubVariable,data,indices::CartesianIndex) = - setindex!(v,data,indices.I...) -Base.setindex!(v::SubVariable,data,indices::CartesianIndices) = - setindex!(v,data,indices.indices...) - - - -dimnames(ds::SubDataset) = dimnames(ds.ds) -defDim(ds::SubDataset,name::SymbolOrString,len) = defDim(ds.ds,name,len) - - -function dim(ds::SubDataset,dimname::SymbolOrString) - dn = Symbol(dimname) - if hasproperty(ds.indices,dn) - ind = getproperty(ds.indices,dn) - if ind == Colon() - return ds.ds.dim[dimname] - else - return length(ind) - end - else - return ds.ds.dim[dimname] - end -end - -unlimited(ds::SubDataset) = unlimited(ds.ds) - - -function SubDataset(ds::AbstractNCDataset,indices) - group = OrderedDict((n => SubDataset(g,indices) for (n,g) in ds.group)...) - SubDataset(ds,indices,ds.attrib,group) -end - -function Base.view(ds::AbstractNCDataset; indices...) - SubDataset(ds,values(indices)) -end - -function Base.getindex(ds::SubDataset,varname::Union{AbstractString, Symbol}) - ncvar = ds.ds[varname] - if ndims(ncvar) == 0 - return ncvar - end - - dims = dimnames(ncvar) - ind = ntuple(i -> get(ds.indices,Symbol(dims[i]),:),ndims(ncvar)) - return view(ncvar,ind...) -end - -function variable(ds::SubDataset,varname::Union{AbstractString, Symbol}) - ncvar = variable(ds.ds,varname) - if ndims(ncvar) == 0 - return ncvar - end - dims = dimnames(ncvar) - ind = ntuple(i -> get(ds.indices,Symbol(dims[i]),:),ndims(ncvar)) - return view(ncvar,ind...) -end - - -Base.keys(ds::SubDataset) = keys(ds.ds) -path(ds::SubDataset) = path(ds.ds) -groupname(ds::SubDataset) = groupname(ds.ds) - - -function dataset(v::SubVariable) - indices = (;((Symbol(d),i) for (d,i) in zip(dimnames(v),v.indices))...) 
- return SubDataset(dataset(v.parent),indices) -end - -function chunking(v::SubVariable) - storage, chunksizes = chunking(v.parent) - return storage, min.(chunksizes,collect(size(v))) -end - -deflate(v::SubVariable) = deflate(v.parent) -checksum(v::SubVariable) = checksum(v.parent) diff --git a/src/types.jl b/src/types.jl index fbb68e1d..29ef2012 100644 --- a/src/types.jl +++ b/src/types.jl @@ -68,68 +68,3 @@ end "Alias to `NCDataset`" const Dataset = NCDataset - -# Multi-file related type definitions - -mutable struct MFVariable{T,N,M,TA,TDS} <: AbstractNCVariable{T,N} - ds::TDS - var::CatArrays.CatArray{T,N,M,TA} - dimnames::NTuple{N,String} - varname::String -end - -mutable struct MFCFVariable{T,N,M,TA,TV,TDS} <: AbstractNCVariable{T,N} - ds::TDS - cfvar::CatArrays.CatArray{T,N,M,TA} - var::TV - dimnames::NTuple{N,String} - varname::String -end - -mutable struct MFDataset{T,N,S<:AbstractString} <: AbstractNCDataset where T <: AbstractNCDataset - ds::Array{T,N} - aggdim::S - isnewdim::Bool - constvars::Vector{Symbol} - _boundsmap::Dict{String,String} -end - -# DeferDataset are Dataset which are open only when there are accessed and -# closed directly after. This is necessary to work with a large number -# of NetCDF files (e.g. more than 1000). - -struct Resource - filename::String - mode::String - metadata::OrderedDict -end - -mutable struct DeferDataset <: AbstractNCDataset - r::Resource - groupname::String - data::OrderedDict - _boundsmap::Union{Nothing,Dict{String,String}} -end - -mutable struct DeferVariable{T,N} <: AbstractNCVariable{T,N} - r::Resource - varname::String - data::OrderedDict -end - -# view of subsets - -struct SubVariable{T,N,TA,TI,TAttrib,TV} <: AbstractNCVariable{T,N} - parent::TA - indices::TI - attrib::TAttrib - # unpacked variable - var::TV -end - -struct SubDataset{TD,TI,TA,TG} <: AbstractNCDataset - ds::TD - indices::TI - attrib::TA - group::TG -end diff --git a/src/variable.jl b/src/variable.jl index e613650c..a315b785 100644 --- a/src/variable.jl +++ b/src/variable.jl @@ -37,6 +37,7 @@ Return a tuple of integers with the size of the variable `var`. Base.size(v::Variable{T,N}) where {T,N} = ntuple(i -> nc_inq_dimlen(v.ds.ncid,v.dimids[i]),Val(N)) +Base.view(v::Variable,indices::Union{Int,Colon,AbstractVector{Int}}...) = SubVariable(v,indices...) """ renameVar(ds::NCDataset,oldname,newname) @@ -513,6 +514,3 @@ end return start,count,stride end - -Base.getindex(v::Union{MFVariable,DeferVariable},ci::CartesianIndices) = v[ci.indices...] -Base.setindex!(v::Union{MFVariable,DeferVariable},data,ci::CartesianIndices) = setindex!(v,data,ci.indices...)
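For reference, a minimal sketch of the user-facing behaviour this patch keeps intact: lazy views (via the `Base.view(v::Variable, ...)` method added to `src/variable.jl` plus the `SubVariable` machinery now imported from CommonDataModel) and coordinate-based subsetting with `@select`. File and variable names are illustrative, following the docstrings shown in the diff:

```julia
using NCDatasets

# create a small sample file
fname = tempname()
NCDataset(fname, "c") do ds
    defVar(ds, "lon", -180.0:180.0, ("lon",))
    defVar(ds, "SST", randn(361), ("lon",))
end

ds = NCDataset(fname)

# lazy view: indices are recorded but no data is read yet
v = view(ds["SST"], 2:4)
size(v)          # (3,)

# subset by coordinate value; also returns a view
sub = NCDatasets.@select(ds["SST"], 30 <= lon <= 60)

close(ds)
```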