diff --git a/ipld/car/.gitattributes b/ipld/car/.gitattributes new file mode 100644 index 0000000000..6f95229927 --- /dev/null +++ b/ipld/car/.gitattributes @@ -0,0 +1,2 @@ +# To prevent CRLF breakages on Windows for fragile files, like testdata. +* -text diff --git a/ipld/car/.github/workflows/go-fuzz.yml b/ipld/car/.github/workflows/go-fuzz.yml new file mode 100644 index 0000000000..830fc9ec29 --- /dev/null +++ b/ipld/car/.github/workflows/go-fuzz.yml @@ -0,0 +1,46 @@ +on: [ push, pull_request ] +name: Go Fuzz + +jobs: + v1: + strategy: + fail-fast: true + matrix: + target: [ "CarReader" ] + runs-on: ubuntu-latest + name: Fuzz V1 ${{ matrix.target }} + steps: + - uses: actions/checkout@v2 + with: + submodules: recursive + - uses: actions/setup-go@v2 + with: + go-version: 1.18.x + - name: Go information + run: | + go version + go env + - name: Run Fuzzing for 1m + run: go test -v -fuzz=Fuzz${{ matrix.target }} -fuzztime=1m . + v2: + strategy: + fail-fast: true + matrix: + target: [ "BlockReader", "Reader", "Inspect" ] + runs-on: ubuntu-latest + name: Fuzz V2 ${{ matrix.target }} + steps: + - uses: actions/checkout@v2 + with: + submodules: recursive + - uses: actions/setup-go@v2 + with: + go-version: 1.18.x + - name: Go information + run: | + go version + go env + - name: Run Fuzzing for 1m + run: | + cd v2 + go test -v -fuzz=Fuzz${{ matrix.target }} -fuzztime=1m . diff --git a/ipld/car/.gitignore b/ipld/car/.gitignore new file mode 100644 index 0000000000..b3f7c18ae7 --- /dev/null +++ b/ipld/car/.gitignore @@ -0,0 +1,4 @@ +car/car +main +coverage.txt +dist/ diff --git a/ipld/car/LICENSE.md b/ipld/car/LICENSE.md new file mode 100644 index 0000000000..2fa16a1537 --- /dev/null +++ b/ipld/car/LICENSE.md @@ -0,0 +1,229 @@ +The contents of this repository are Copyright (c) corresponding authors and +contributors, licensed under the `Permissive License Stack` meaning either of: + +- Apache-2.0 Software License: https://www.apache.org/licenses/LICENSE-2.0 + ([...4tr2kfsq](https://dweb.link/ipfs/bafkreiankqxazcae4onkp436wag2lj3ccso4nawxqkkfckd6cg4tr2kfsq)) + +- MIT Software License: https://opensource.org/licenses/MIT + ([...vljevcba](https://dweb.link/ipfs/bafkreiepofszg4gfe2gzuhojmksgemsub2h4uy2gewdnr35kswvljevcba)) + +You may not use the contents of this repository except in compliance +with one of the listed Licenses. For an extended clarification of the +intent behind the choice of Licensing please refer to +https://protocol.ai/blog/announcing-the-permissive-license-stack/ + +Unless required by applicable law or agreed to in writing, software +distributed under the terms listed in this notice is distributed on +an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +either express or implied. See each License for the specific language +governing permissions and limitations under that License. + + +`SPDX-License-Identifier: Apache-2.0 OR MIT` + +Verbatim copies of both licenses are included below: + +
Apache-2.0 Software License + +``` + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS +``` +
+ +
MIT Software License + +``` +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +``` +
diff --git a/ipld/car/README.md b/ipld/car/README.md
new file mode 100644
index 0000000000..af75fa1d73
--- /dev/null
+++ b/ipld/car/README.md
@@ -0,0 +1,71 @@
+go-car (go!)
+==================
+
+[![](https://img.shields.io/badge/made%20by-Protocol%20Labs-blue.svg?style=flat-square)](https://protocol.ai)
+[![](https://img.shields.io/badge/project-ipld-orange.svg?style=flat-square)](https://github.com/ipld/ipld)
+[![](https://img.shields.io/badge/matrix-%23ipld-blue.svg?style=flat-square)](https://matrix.to/#/#ipld:ipfs.io)
+[![Go Reference](https://pkg.go.dev/badge/github.com/ipld/go-car.svg)](https://pkg.go.dev/github.com/ipld/go-car)
+[![Coverage Status](https://codecov.io/gh/ipld/go-car/branch/master/graph/badge.svg)](https://codecov.io/gh/ipld/go-car/branch/master)
+
+> Work with car (Content addressed ARchive) files!
+
+This is a Golang implementation of the [CAR specifications](https://ipld.io/specs/transport/car/), both [CARv1](https://ipld.io/specs/transport/car/carv1/) and [CARv2](https://ipld.io/specs/transport/car/carv2/).
+
+There are two major module versions:
+
+* [`go-car/v2`](v2/) is geared towards reading and writing CARv2 files, and also
+  supports consuming CARv1 files and using CAR files as an IPFS blockstore.
+* `go-car`, in the root directory, only supports reading and writing CARv1 files.
+
+Most users should use v2, especially for new software, since the v2 API transparently supports both CAR formats.
+
+## Usage / Installation
+
+This repository provides a `car` binary that can be used for creating, extracting, and working with car files.
+
+To install the latest version of `car`, run:
+```shell script
+go install github.com/ipld/go-car/cmd/car@latest
+```
+
+More information about this binary is available in [`cmd/car`](cmd/car/).
+
+
+## Features
+
+[CARv2](v2) features:
+* [Generate index](https://pkg.go.dev/github.com/ipld/go-car/v2#GenerateIndex) from an existing CARv1 file.
+* [Wrap](https://pkg.go.dev/github.com/ipld/go-car/v2#WrapV1) CARv1 files into a CARv2 with automatic index generation.
+* Random access to blocks in a CAR file given their CID via the [Read-Only blockstore](https://pkg.go.dev/github.com/ipld/go-car/v2/blockstore#NewReadOnly) API, with transparent support for both CARv1 and CARv2.
+* Write CARv2 files via the [Read-Write blockstore](https://pkg.go.dev/github.com/ipld/go-car/v2/blockstore#OpenReadWrite) API, with support for appending blocks to an existing CARv2 file and resumption from a partially written CARv2 file.
+* Individual access to the [inner CARv1 data payload](https://pkg.go.dev/github.com/ipld/go-car/v2#Reader.DataReader) and [index](https://pkg.go.dev/github.com/ipld/go-car/v2#Reader.IndexReader) of a CARv2 file via the `Reader` API.
+
+
+## API Documentation
+
+See docs on [pkg.go.dev](https://pkg.go.dev/github.com/ipld/go-car).
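+
+For instance, here is a minimal sketch of random-access reads through the v2 read-only
+blockstore, which accepts either CAR format. `example.car` is a placeholder path, and
+error handling is reduced to panics for brevity:
+
+```go
+package main
+
+import (
+	"context"
+	"fmt"
+
+	"github.com/ipld/go-car/v2/blockstore"
+)
+
+func main() {
+	// Open the CAR as a read-only blockstore; for a CARv1 (or an
+	// unindexed CARv2) input, an index is built in memory on open.
+	bs, err := blockstore.OpenReadOnly("example.car")
+	if err != nil {
+		panic(err)
+	}
+	defer bs.Close()
+
+	roots, err := bs.Roots()
+	if err != nil {
+		panic(err)
+	}
+	for _, r := range roots {
+		blk, err := bs.Get(context.Background(), r)
+		if err != nil {
+			panic(err)
+		}
+		fmt.Printf("%s: %d bytes\n", r, len(blk.RawData()))
+	}
+}
+```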
+ +## Examples + +Here is a shortlist of other examples from the documentation + +* [Wrap an existing CARv1 file into an indexed CARv2 file](https://pkg.go.dev/github.com/ipld/go-car/v2#example-WrapV1File) +* [Open read-only blockstore from a CAR file](https://pkg.go.dev/github.com/ipld/go-car/v2/blockstore#example-OpenReadOnly) +* [Open read-write blockstore from a CAR file](https://pkg.go.dev/github.com/ipld/go-car/v2/blockstore#example-OpenReadWrite) +* [Read the index from an existing CARv2 file](https://pkg.go.dev/github.com/ipld/go-car/v2/index#example-ReadFrom) +* [Extract the index from a CARv2 file and store it as a separate file](https://pkg.go.dev/github.com/ipld/go-car/v2/index#example-WriteTo) + +## Maintainers + +* [Masih Derkani](https://github.com/masih) +* [Will Scott](https://github.com/willscott) + +## Contribute + +PRs are welcome! + +When editing the Readme, please conform to the [standard-readme](https://github.com/RichardLitt/standard-readme) specification. + +## License + +Apache-2.0/MIT © Protocol Labs diff --git a/ipld/car/car.go b/ipld/car/car.go new file mode 100644 index 0000000000..026bbb7359 --- /dev/null +++ b/ipld/car/car.go @@ -0,0 +1,223 @@ +package car + +import ( + "bufio" + "context" + "fmt" + "io" + + cid "github.com/ipfs/go-cid" + cbor "github.com/ipfs/go-ipld-cbor" + format "github.com/ipfs/go-ipld-format" + blocks "github.com/ipfs/go-libipfs/blocks" + "github.com/ipfs/go-merkledag" + + util "github.com/ipld/go-car/util" +) + +func init() { + cbor.RegisterCborType(CarHeader{}) +} + +type Store interface { + Put(context.Context, blocks.Block) error +} + +type ReadStore interface { + Get(context.Context, cid.Cid) (blocks.Block, error) +} + +type CarHeader struct { + Roots []cid.Cid + Version uint64 +} + +type carWriter struct { + ds format.NodeGetter + w io.Writer + walk WalkFunc +} + +type WalkFunc func(format.Node) ([]*format.Link, error) + +func WriteCar(ctx context.Context, ds format.NodeGetter, roots []cid.Cid, w io.Writer, options ...merkledag.WalkOption) error { + return WriteCarWithWalker(ctx, ds, roots, w, DefaultWalkFunc, options...) 
+} + +func WriteCarWithWalker(ctx context.Context, ds format.NodeGetter, roots []cid.Cid, w io.Writer, walk WalkFunc, options ...merkledag.WalkOption) error { + + h := &CarHeader{ + Roots: roots, + Version: 1, + } + + if err := WriteHeader(h, w); err != nil { + return fmt.Errorf("failed to write car header: %s", err) + } + + cw := &carWriter{ds: ds, w: w, walk: walk} + seen := cid.NewSet() + for _, r := range roots { + if err := merkledag.Walk(ctx, cw.enumGetLinks, r, seen.Visit, options...); err != nil { + return err + } + } + return nil +} + +func DefaultWalkFunc(nd format.Node) ([]*format.Link, error) { + return nd.Links(), nil +} + +func ReadHeader(br *bufio.Reader) (*CarHeader, error) { + hb, err := util.LdRead(br) + if err != nil { + return nil, err + } + + var ch CarHeader + if err := cbor.DecodeInto(hb, &ch); err != nil { + return nil, fmt.Errorf("invalid header: %v", err) + } + + return &ch, nil +} + +func WriteHeader(h *CarHeader, w io.Writer) error { + hb, err := cbor.DumpObject(h) + if err != nil { + return err + } + + return util.LdWrite(w, hb) +} + +func HeaderSize(h *CarHeader) (uint64, error) { + hb, err := cbor.DumpObject(h) + if err != nil { + return 0, err + } + + return util.LdSize(hb), nil +} + +func (cw *carWriter) enumGetLinks(ctx context.Context, c cid.Cid) ([]*format.Link, error) { + nd, err := cw.ds.Get(ctx, c) + if err != nil { + return nil, err + } + + if err := cw.writeNode(ctx, nd); err != nil { + return nil, err + } + + return cw.walk(nd) +} + +func (cw *carWriter) writeNode(ctx context.Context, nd format.Node) error { + return util.LdWrite(cw.w, nd.Cid().Bytes(), nd.RawData()) +} + +type CarReader struct { + br *bufio.Reader + Header *CarHeader +} + +func NewCarReader(r io.Reader) (*CarReader, error) { + br := bufio.NewReader(r) + ch, err := ReadHeader(br) + if err != nil { + return nil, err + } + + if ch.Version != 1 { + return nil, fmt.Errorf("invalid car version: %d", ch.Version) + } + + if len(ch.Roots) == 0 { + return nil, fmt.Errorf("empty car, no roots") + } + + return &CarReader{ + br: br, + Header: ch, + }, nil +} + +func (cr *CarReader) Next() (blocks.Block, error) { + c, data, err := util.ReadNode(cr.br) + if err != nil { + return nil, err + } + + hashed, err := c.Prefix().Sum(data) + if err != nil { + return nil, err + } + + if !hashed.Equals(c) { + return nil, fmt.Errorf("mismatch in content integrity, name: %s, data: %s", c, hashed) + } + + return blocks.NewBlockWithCid(data, c) +} + +type batchStore interface { + PutMany(context.Context, []blocks.Block) error +} + +func LoadCar(ctx context.Context, s Store, r io.Reader) (*CarHeader, error) { + cr, err := NewCarReader(r) + if err != nil { + return nil, err + } + + if bs, ok := s.(batchStore); ok { + return loadCarFast(ctx, bs, cr) + } + + return loadCarSlow(ctx, s, cr) +} + +func loadCarFast(ctx context.Context, s batchStore, cr *CarReader) (*CarHeader, error) { + var buf []blocks.Block + for { + blk, err := cr.Next() + if err != nil { + if err == io.EOF { + if len(buf) > 0 { + if err := s.PutMany(ctx, buf); err != nil { + return nil, err + } + } + return cr.Header, nil + } + return nil, err + } + + buf = append(buf, blk) + + if len(buf) > 1000 { + if err := s.PutMany(ctx, buf); err != nil { + return nil, err + } + buf = buf[:0] + } + } +} + +func loadCarSlow(ctx context.Context, s Store, cr *CarReader) (*CarHeader, error) { + for { + blk, err := cr.Next() + if err != nil { + if err == io.EOF { + return cr.Header, nil + } + return nil, err + } + + if err := s.Put(ctx, blk); err != nil { + 
return nil, err + } + } +} diff --git a/ipld/car/car_test.go b/ipld/car/car_test.go new file mode 100644 index 0000000000..3c6340be3e --- /dev/null +++ b/ipld/car/car_test.go @@ -0,0 +1,229 @@ +package car_test + +import ( + "bytes" + "context" + "encoding/hex" + "io" + "strings" + "testing" + + "github.com/ipfs/go-cid" + format "github.com/ipfs/go-ipld-format" + "github.com/ipfs/go-merkledag" + dstest "github.com/ipfs/go-merkledag/test" + car "github.com/ipld/go-car" +) + +func assertAddNodes(t *testing.T, ds format.DAGService, nds ...format.Node) { + for _, nd := range nds { + if err := ds.Add(context.Background(), nd); err != nil { + t.Fatal(err) + } + } +} + +func TestRoundtrip(t *testing.T) { + ctx := context.Background() + dserv := dstest.Mock() + a := merkledag.NewRawNode([]byte("aaaa")) + b := merkledag.NewRawNode([]byte("bbbb")) + c := merkledag.NewRawNode([]byte("cccc")) + + nd1 := &merkledag.ProtoNode{} + nd1.AddNodeLink("cat", a) + + nd2 := &merkledag.ProtoNode{} + nd2.AddNodeLink("first", nd1) + nd2.AddNodeLink("dog", b) + + nd3 := &merkledag.ProtoNode{} + nd3.AddNodeLink("second", nd2) + nd3.AddNodeLink("bear", c) + + assertAddNodes(t, dserv, a, b, c, nd1, nd2, nd3) + + buf := new(bytes.Buffer) + if err := car.WriteCar(context.Background(), dserv, []cid.Cid{nd3.Cid()}, buf); err != nil { + t.Fatal(err) + } + + bserv := dstest.Bserv() + ch, err := car.LoadCar(ctx, bserv.Blockstore(), buf) + if err != nil { + t.Fatal(err) + } + + if len(ch.Roots) != 1 { + t.Fatal("should have one root") + } + + if !ch.Roots[0].Equals(nd3.Cid()) { + t.Fatal("got wrong cid") + } + + bs := bserv.Blockstore() + for _, nd := range []format.Node{a, b, c, nd1, nd2, nd3} { + has, err := bs.Has(ctx, nd.Cid()) + if err != nil { + t.Fatal(err) + } + + if !has { + t.Fatal("should have cid in blockstore") + } + } +} + +// fixture is a clean single-block, single-root CAR +const fixtureStr = "3aa265726f6f747381d82a58250001711220151fe9e73c6267a7060c6f6c4cca943c236f4b196723489608edb42a8b8fa80b6776657273696f6e012c01711220151fe9e73c6267a7060c6f6c4cca943c236f4b196723489608edb42a8b8fa80ba165646f646779f5" + +func TestEOFHandling(t *testing.T) { + fixture, err := hex.DecodeString(fixtureStr) + if err != nil { + t.Fatal(err) + } + + load := func(t *testing.T, byts []byte) *car.CarReader { + cr, err := car.NewCarReader(bytes.NewReader(byts)) + if err != nil { + t.Fatal(err) + } + + blk, err := cr.Next() + if err != nil { + t.Fatal(err) + } + if blk.Cid().String() != "bafyreiavd7u6opdcm6tqmddpnrgmvfb4enxuwglhenejmchnwqvixd5ibm" { + t.Fatal("unexpected CID") + } + + return cr + } + + t.Run("CleanEOF", func(t *testing.T) { + cr := load(t, fixture) + + blk, err := cr.Next() + if err != io.EOF { + t.Fatal("Didn't get expected EOF") + } + if blk != nil { + t.Fatal("EOF returned expected block") + } + }) + + t.Run("BadVarint", func(t *testing.T) { + fixtureBadVarint := append(fixture, 160) + cr := load(t, fixtureBadVarint) + + blk, err := cr.Next() + if err != io.ErrUnexpectedEOF { + t.Fatal("Didn't get unexpected EOF") + } + if blk != nil { + t.Fatal("EOF returned unexpected block") + } + }) + + t.Run("TruncatedBlock", func(t *testing.T) { + fixtureTruncatedBlock := append(fixture, 100, 0, 0) + cr := load(t, fixtureTruncatedBlock) + + blk, err := cr.Next() + if err != io.ErrUnexpectedEOF { + t.Fatal("Didn't get unexpected EOF") + } + if blk != nil { + t.Fatal("EOF returned unexpected block") + } + }) +} + +func TestBadHeaders(t *testing.T) { + testCases := []struct { + name string + hex string + errStr string // either the 
whole error string + errPfx string // or just the prefix + }{ + { + "{version:2}", + "0aa16776657273696f6e02", + "invalid car version: 2", + "", + }, + { + // an unfortunate error because we don't use a pointer + "{roots:[baeaaaa3bmjrq]}", + "13a165726f6f747381d82a480001000003616263", + "invalid car version: 0", + "", + }, { + "{version:\"1\",roots:[baeaaaa3bmjrq]}", + "1da265726f6f747381d82a4800010000036162636776657273696f6e6131", + "", "invalid header: ", + }, { + "{version:1}", + "0aa16776657273696f6e01", + "empty car, no roots", + "", + }, { + "{version:1,roots:{cid:baeaaaa3bmjrq}}", + "20a265726f6f7473a163636964d82a4800010000036162636776657273696f6e01", + "", + "invalid header: ", + }, { + "{version:1,roots:[baeaaaa3bmjrq],blip:true}", + "22a364626c6970f565726f6f747381d82a4800010000036162636776657273696f6e01", + "", + "invalid header: ", + }, { + "[1,[]]", + "03820180", + "", + "invalid header: ", + }, { + // this is an unfortunate error, it'd be nice to catch it better but it's + // very unlikely we'd ever see this in practice + "null", + "01f6", + "", + "invalid car version: 0", + }, + } + + makeCar := func(t *testing.T, byts string) error { + fixture, err := hex.DecodeString(byts) + if err != nil { + t.Fatal(err) + } + _, err = car.NewCarReader(bytes.NewReader(fixture)) + return err + } + + t.Run("Sanity check {version:1,roots:[baeaaaa3bmjrq]}", func(t *testing.T) { + err := makeCar(t, "1ca265726f6f747381d82a4800010000036162636776657273696f6e01") + if err != nil { + t.Fatal(err) + } + }) + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + err := makeCar(t, tc.hex) + if err == nil { + t.Fatal("expected error from bad header, didn't get one") + } + if tc.errStr != "" { + if err.Error() != tc.errStr { + t.Fatalf("bad error: %v", err) + } + } else { + if !strings.HasPrefix(err.Error(), tc.errPfx) { + t.Fatalf("bad error: %v", err) + } + } + }) + } +} diff --git a/ipld/car/cmd/car/README.md b/ipld/car/cmd/car/README.md new file mode 100644 index 0000000000..995850fd73 --- /dev/null +++ b/ipld/car/cmd/car/README.md @@ -0,0 +1,38 @@ +car - The CLI tool +================== + +[![](https://img.shields.io/badge/made%20by-Protocol%20Labs-blue.svg?style=flat-square)](https://protocol.ai) +[![](https://img.shields.io/badge/project-ipld-orange.svg?style=flat-square)](https://github.com/ipld/ipld) +[![](https://img.shields.io/badge/matrix-%23ipld-blue.svg?style=flat-square)](https://matrix.to/#/#ipld:ipfs.io) + +> A CLI for interacting with car files + +## Usage + +``` +USAGE: + car [global options] command [command options] [arguments...] 
+ +COMMANDS: + compile compile a car file from a debug patch + create, c Create a car file + debug debug a car file + detach-index Detach an index to a detached file + extract, x Extract the contents of a car when the car encodes UnixFS data + filter, f Filter the CIDs in a car + get-block, gb Get a block out of a car + get-dag, gd Get a dag out of a car + index, i write out the car with an index + inspect verifies a car and prints a basic report about its contents + list, l, ls List the CIDs in a car + root Get the root CID of a car + verify, v Verify a CAR is wellformed + help, h Shows a list of commands or help for one command +``` + +## Install + +To install the latest version of `car`, run: +```shell script +go install github.com/ipld/go-car/cmd/car@latest +``` diff --git a/ipld/car/cmd/car/car.go b/ipld/car/cmd/car/car.go new file mode 100644 index 0000000000..d2356484cc --- /dev/null +++ b/ipld/car/cmd/car/car.go @@ -0,0 +1,218 @@ +package main + +import ( + "log" + "os" + + "github.com/multiformats/go-multicodec" + "github.com/urfave/cli/v2" +) + +func main() { os.Exit(main1()) } + +func main1() int { + app := &cli.App{ + Name: "car", + Usage: "Utility for working with car files", + Commands: []*cli.Command{ + { + Name: "compile", + Usage: "compile a car file from a debug patch", + Action: CompileCar, + Flags: []cli.Flag{ + &cli.StringFlag{ + Name: "output", + Aliases: []string{"o", "f"}, + Usage: "The file to write to", + TakesFile: true, + }, + }, + }, + { + Name: "create", + Usage: "Create a car file", + Aliases: []string{"c"}, + Action: CreateCar, + Flags: []cli.Flag{ + &cli.StringFlag{ + Name: "file", + Aliases: []string{"f", "output", "o"}, + Usage: "The car file to write to", + TakesFile: true, + }, + &cli.IntFlag{ + Name: "version", + Value: 2, + Usage: "Write output as a v1 or v2 format car", + }, + }, + }, + { + Name: "debug", + Usage: "debug a car file", + Action: DebugCar, + Flags: []cli.Flag{ + &cli.StringFlag{ + Name: "output", + Aliases: []string{"o", "f"}, + Usage: "The file to write to", + TakesFile: true, + }, + }, + }, + { + Name: "detach-index", + Usage: "Detach an index to a detached file", + Action: DetachCar, + Subcommands: []*cli.Command{{ + Name: "list", + Usage: "List a detached index", + Action: DetachCarList, + }}, + }, + { + Name: "extract", + Aliases: []string{"x"}, + Usage: "Extract the contents of a car when the car encodes UnixFS data", + Action: ExtractCar, + ArgsUsage: "[output directory|-]", + Flags: []cli.Flag{ + &cli.StringFlag{ + Name: "file", + Aliases: []string{"f"}, + Usage: "The car file to extract from, or stdin if omitted", + Required: false, + TakesFile: true, + }, + &cli.StringFlag{ + Name: "path", + Aliases: []string{"p"}, + Usage: "The unixfs path to extract", + Required: false, + }, + &cli.BoolFlag{ + Name: "verbose", + Aliases: []string{"v"}, + Usage: "Include verbose information about extracted contents", + }, + }, + }, + { + Name: "filter", + Aliases: []string{"f"}, + Usage: "Filter the CIDs in a car", + Action: FilterCar, + Flags: []cli.Flag{ + &cli.StringFlag{ + Name: "cid-file", + Usage: "A file to read CIDs from", + TakesFile: true, + }, + &cli.BoolFlag{ + Name: "append", + Usage: "Append cids to an existing output file", + }, + }, + }, + { + Name: "get-block", + Aliases: []string{"gb"}, + Usage: "Get a block out of a car", + Action: GetCarBlock, + }, + { + Name: "get-dag", + Aliases: []string{"gd"}, + Usage: "Get a dag out of a car", + Action: GetCarDag, + Flags: []cli.Flag{ + &cli.StringFlag{ + Name: "selector", + Aliases: 
[]string{"s"}, + Usage: "A selector over the dag", + }, + &cli.BoolFlag{ + Name: "strict", + Usage: "Fail if the selector finds links to blocks not in the original car", + }, + &cli.IntFlag{ + Name: "version", + Value: 2, + Usage: "Write output as a v1 or v2 format car", + }, + }, + }, + { + Name: "index", + Aliases: []string{"i"}, + Usage: "write out the car with an index", + Action: IndexCar, + Flags: []cli.Flag{ + &cli.StringFlag{ + Name: "codec", + Aliases: []string{"c"}, + Usage: "The type of index to write", + Value: multicodec.CarMultihashIndexSorted.String(), + }, + &cli.IntFlag{ + Name: "version", + Value: 2, + Usage: "Write output as a v1 or v2 format car", + }, + }, + Subcommands: []*cli.Command{{ + Name: "create", + Usage: "Write out a detached index", + Action: CreateIndex, + }}, + }, + { + Name: "inspect", + Usage: "verifies a car and prints a basic report about its contents", + Action: InspectCar, + Flags: []cli.Flag{ + &cli.BoolFlag{ + Name: "full", + Value: false, + Usage: "Check that the block data hash digests match the CIDs", + }, + }, + }, + { + Name: "list", + Aliases: []string{"l", "ls"}, + Usage: "List the CIDs in a car", + Action: ListCar, + Flags: []cli.Flag{ + &cli.BoolFlag{ + Name: "verbose", + Aliases: []string{"v"}, + Usage: "Include verbose information about contained blocks", + }, + &cli.BoolFlag{ + Name: "unixfs", + Usage: "List unixfs filesystem from the root of the car", + }, + }, + }, + { + Name: "root", + Usage: "Get the root CID of a car", + Action: CarRoot, + }, + { + Name: "verify", + Aliases: []string{"v"}, + Usage: "Verify a CAR is wellformed", + Action: VerifyCar, + }, + }, + } + + err := app.Run(os.Args) + if err != nil { + log.Println(err) + return 1 + } + return 0 +} diff --git a/ipld/car/cmd/car/compile.go b/ipld/car/cmd/car/compile.go new file mode 100644 index 0000000000..f6a1b49791 --- /dev/null +++ b/ipld/car/cmd/car/compile.go @@ -0,0 +1,463 @@ +package main + +import ( + "bufio" + "bytes" + "context" + "fmt" + "io" + "os" + "regexp" + "strings" + "unicode/utf8" + + "github.com/ipfs/go-cid" + blocks "github.com/ipfs/go-libipfs/blocks" + carv1 "github.com/ipld/go-car" + "github.com/ipld/go-car/util" + carv2 "github.com/ipld/go-car/v2" + "github.com/ipld/go-car/v2/blockstore" + "github.com/ipld/go-ipld-prime" + "github.com/ipld/go-ipld-prime/codec" + "github.com/ipld/go-ipld-prime/codec/dagjson" + "github.com/ipld/go-ipld-prime/datamodel" + "github.com/ipld/go-ipld-prime/linking" + cidlink "github.com/ipld/go-ipld-prime/linking/cid" + "github.com/ipld/go-ipld-prime/node/basicnode" + "github.com/ipld/go-ipld-prime/storage/memstore" + "github.com/polydawn/refmt/json" + "github.com/urfave/cli/v2" + "golang.org/x/exp/slices" +) + +var ( + plusLineRegex = regexp.MustCompile(`^\+\+\+ ([\w-]+) ([\S]+ )?([\w]+)$`) +) + +// Compile is a command to translate between a human-debuggable patch-like format and a car file. +func CompileCar(c *cli.Context) error { + var err error + inStream := os.Stdin + if c.Args().Len() >= 1 { + inStream, err = os.Open(c.Args().First()) + if err != nil { + return err + } + } + + //parse headers. 
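+	// The debug format, as mirrored by the parsing below, starts with a header
+	// line of the form "car compile [--v2 ]<output name>", followed by zero or
+	// more "root <cid>" lines, up to the first "--- <cid>" patch chunk.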
+	br := bufio.NewReader(inStream)
+	header, _, err := br.ReadLine()
+	if err != nil {
+		return err
+	}
+
+	v2 := strings.HasPrefix(string(header), "car compile --v2 ")
+	rest := strings.TrimPrefix(string(header), "car compile ")
+	if v2 {
+		rest = strings.TrimPrefix(rest, "--v2 ")
+	}
+	carName := strings.TrimSpace(rest)
+
+	roots := make([]cid.Cid, 0)
+	for {
+		peek, err := br.Peek(4)
+		if err == io.EOF {
+			break
+		} else if err != nil {
+			return err
+		}
+		if bytes.Equal(peek, []byte("--- ")) {
+			break
+		}
+		rootLine, _, err := br.ReadLine()
+		if err != nil {
+			return err
+		}
+		if strings.HasPrefix(string(rootLine), "root ") {
+			var rCidS string
+			fmt.Sscanf(string(rootLine), "root %s", &rCidS)
+			rCid, err := cid.Parse(rCidS)
+			if err != nil {
+				return err
+			}
+			roots = append(roots, rCid)
+		}
+	}
+
+	//parse blocks.
+	cidList := make([]cid.Cid, 0)
+	rawBlocks := make(map[cid.Cid][]byte)
+	rawCodecs := make(map[cid.Cid]string)
+
+	for {
+		nextCid, mode, nextBlk, err := parsePatch(br)
+		if err == io.EOF {
+			break
+		} else if err != nil {
+			return err
+		}
+		rawBlocks[nextCid] = nextBlk
+		rawCodecs[nextCid] = mode
+		cidList = append(cidList, nextCid)
+	}
+
+	// Re-create the original IPLD encoded blocks, but allowing for modifications of the
+	// patch data which may generate new CIDs; so we track the DAG relationships and
+	// rewrite CIDs in other referring blocks where they get updated.
+
+	// structure as a tree
+	childMap := make(map[cid.Cid][]cid.Cid)
+	for c := range rawBlocks {
+		if _, ok := childMap[c]; !ok {
+			childMap[c] = make([]cid.Cid, 0)
+		}
+		for d, blk := range rawBlocks {
+			if c.Equals(d) {
+				continue
+			}
+			if strings.Contains(string(blk), c.String()) {
+				if _, ok := childMap[d]; !ok {
+					childMap[d] = make([]cid.Cid, 0)
+				}
+				childMap[d] = append(childMap[d], c)
+			} else if strings.Contains(string(blk), string(c.Bytes())) {
+				if _, ok := childMap[d]; !ok {
+					childMap[d] = make([]cid.Cid, 0)
+				}
+				childMap[d] = append(childMap[d], c)
+			}
+		}
+	}
+
+	// re-parse/re-build CIDs
+	outBlocks := make(map[cid.Cid][]byte)
+	for len(childMap) > 0 {
+		for origCid, kids := range childMap {
+			if len(kids) == 0 {
+				// compile to final cid
+				blk := rawBlocks[origCid]
+				finalCid, finalBlk, err := serializeBlock(c.Context, origCid.Prefix(), rawCodecs[origCid], blk)
+				if err != nil {
+					return err
+				}
+				outBlocks[finalCid] = finalBlk
+				idx := slices.Index(cidList, origCid)
+				cidList[idx] = finalCid
+
+				// update other remaining nodes with the new cid.
+				for otherCid, otherKids := range childMap {
+					for i, otherKid := range otherKids {
+						if otherKid.Equals(origCid) {
+							if !finalCid.Equals(origCid) {
+								// update block
+								rawBlocks[otherCid] = bytes.ReplaceAll(rawBlocks[otherCid], origCid.Bytes(), finalCid.Bytes())
+								rawBlocks[otherCid] = bytes.ReplaceAll(rawBlocks[otherCid], []byte(origCid.String()), []byte(finalCid.String()))
+							}
+							// remove from childMap
+							nok := append(otherKids[0:i], otherKids[i+1:]...)
+							childMap[otherCid] = nok
+							break // to next child map entry.
+ } + } + } + + delete(childMap, origCid) + } + } + } + + if !v2 { + // write output + outStream := os.Stdout + if c.IsSet("output") { + outFileName := c.String("output") + if outFileName == "" { + outFileName = carName + } + outFile, err := os.Create(outFileName) + if err != nil { + return err + } + defer outFile.Close() + outStream = outFile + } + + if err := carv1.WriteHeader(&carv1.CarHeader{ + Roots: roots, + Version: 1, + }, outStream); err != nil { + return err + } + for c, blk := range outBlocks { + if err := util.LdWrite(outStream, c.Bytes(), blk); err != nil { + return err + } + } + } else { + outFileName := c.String("output") + if outFileName == "" { + outFileName = carName + } + + if outFileName == "-" && !c.IsSet("output") { + return fmt.Errorf("cannot stream carv2's to stdout") + } + bs, err := blockstore.OpenReadWrite(outFileName, roots) + if err != nil { + return err + } + for _, bc := range cidList { + blk := outBlocks[bc] + ob, _ := blocks.NewBlockWithCid(blk, bc) + bs.Put(c.Context, ob) + } + return bs.Finalize() + } + + return nil +} + +func serializeBlock(ctx context.Context, codec cid.Prefix, encoding string, raw []byte) (cid.Cid, []byte, error) { + ls := cidlink.DefaultLinkSystem() + store := memstore.Store{Bag: map[string][]byte{}} + ls.SetReadStorage(&store) + ls.SetWriteStorage(&store) + b := basicnode.Prototype.Any.NewBuilder() + if encoding == "dag-json" { + if err := dagjson.Decode(b, bytes.NewBuffer(raw)); err != nil { + return cid.Undef, nil, err + } + } else if encoding == "raw" { + if err := b.AssignBytes(raw); err != nil { + return cid.Undef, nil, err + } + } else { + return cid.Undef, nil, fmt.Errorf("unknown encoding: %s", encoding) + } + lnk, err := ls.Store(linking.LinkContext{Ctx: ctx}, cidlink.LinkPrototype{Prefix: codec}, b.Build()) + if err != nil { + return cid.Undef, nil, err + } + outCid := lnk.(cidlink.Link).Cid + outBytes, outErr := store.Get(ctx, outCid.KeyString()) + return outCid, outBytes, outErr +} + +// DebugCar is a command to translate between a car file, and a human-debuggable patch-like format. +func DebugCar(c *cli.Context) error { + var err error + inStream := os.Stdin + inFile := "-" + if c.Args().Len() >= 1 { + inFile = c.Args().First() + inStream, err = os.Open(inFile) + if err != nil { + return err + } + } + + rd, err := carv2.NewBlockReader(inStream) + if err != nil { + return err + } + + // patch the header. + outStream := os.Stdout + if c.IsSet("output") { + outFileName := c.String("output") + outFile, err := os.Create(outFileName) + if err != nil { + return err + } + defer outFile.Close() + outStream = outFile + } + + outStream.WriteString("car compile ") + if rd.Version == 2 { + outStream.WriteString("--v2 ") + } + + outStream.WriteString(inFile + "\n") + for _, rt := range rd.Roots { + fmt.Fprintf(outStream, "root %s\n", rt.String()) + } + + // patch each block. 
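+	// Each block is emitted as one patch-style chunk of the form
+	//   --- <cid>
+	//   +++ <codec>[ (no-end-cr)] <cid>
+	//   @@ -0,<line count> +0,<line count> @@
+	//   <dag-json or raw payload>
+	// (see patch below; parsePatch in this file reads the same layout back).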
+	nxt, err := rd.Next()
+	if err != nil {
+		return err
+	}
+	for nxt != nil {
+		chunk, err := patch(c.Context, nxt.Cid(), nxt.RawData())
+		if err != nil {
+			return err
+		}
+		outStream.Write(chunk)
+
+		nxt, err = rd.Next()
+		if err == io.EOF {
+			return nil
+		}
+		if err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func patch(ctx context.Context, c cid.Cid, blk []byte) ([]byte, error) {
+	ls := cidlink.DefaultLinkSystem()
+	store := memstore.Store{Bag: map[string][]byte{}}
+	ls.SetReadStorage(&store)
+	ls.SetWriteStorage(&store)
+	if err := store.Put(ctx, c.KeyString(), blk); err != nil {
+		return nil, err
+	}
+	node, err := ls.Load(linking.LinkContext{Ctx: ctx}, cidlink.Link{Cid: c}, basicnode.Prototype.Any)
+	if err != nil {
+		return nil, fmt.Errorf("could not load block: %q", err)
+	}
+
+	outMode := "dag-json"
+	if node.Kind() == datamodel.Kind_Bytes && isPrintable(node) {
+		outMode = "raw"
+	}
+	finalBuf := bytes.NewBuffer(nil)
+
+	if outMode == "dag-json" {
+		opts := dagjson.EncodeOptions{
+			EncodeLinks: true,
+			EncodeBytes: true,
+			MapSortMode: codec.MapSortMode_Lexical,
+		}
+		if err := dagjson.Marshal(node, json.NewEncoder(finalBuf, json.EncodeOptions{Line: []byte{'\n'}, Indent: []byte{'\t'}}), opts); err != nil {
+			return nil, err
+		}
+	} else if outMode == "raw" {
+		nb, err := node.AsBytes()
+		if err != nil {
+			return nil, err
+		}
+		finalBuf.Write(nb)
+	}
+
+	// figure out number of lines.
+	lcnt := strings.Count(finalBuf.String(), "\n")
+	crStr := " (no-end-cr)"
+	if finalBuf.Len() > 0 && finalBuf.Bytes()[finalBuf.Len()-1] == '\n' {
+		crStr = ""
+	}
+
+	outBuf := bytes.NewBuffer(nil)
+	outBuf.WriteString("--- " + c.String() + "\n")
+	outBuf.WriteString("+++ " + outMode + crStr + " " + c.String() + "\n")
+	outBuf.WriteString(fmt.Sprintf("@@ -%d,%d +%d,%d @@\n", 0, lcnt, 0, lcnt))
+	outBuf.Write(finalBuf.Bytes())
+	outBuf.WriteString("\n")
+	return outBuf.Bytes(), nil
+}
+
+func isPrintable(n ipld.Node) bool {
+	b, err := n.AsBytes()
+	if err != nil {
+		return false
+	}
+	if !utf8.Valid(b) {
+		return false
+	}
+	if bytes.ContainsAny(b, string([]byte{0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x10, 0x11, 0x12, 0x13, 0x14, 0x16, 0x17, 0x18, 0x19, 0x1c, 0x1d, 0x1e, 0x1f})) {
+		return false
+	}
+	// check if it would confuse the 'end of patch' checker.
+	if bytes.Contains(b, []byte("\n--- ")) {
+		return false
+	}
+	return true
+}
+
+func parsePatch(br *bufio.Reader) (cid.Cid, string, []byte, error) {
+	// read initial line to parse CID.
+	l1, isPrefix, err := br.ReadLine()
+	if err != nil {
+		return cid.Undef, "", nil, err
+	}
+	if isPrefix {
+		return cid.Undef, "", nil, fmt.Errorf("unexpected long header l1")
+	}
+	var cs string
+	if _, err := fmt.Sscanf(string(l1), "--- %s", &cs); err != nil {
+		return cid.Undef, "", nil, fmt.Errorf("could not parse patch cid line (%s): %q", l1, err)
+	}
+	l2, isPrefix, err := br.ReadLine()
+	if err != nil {
+		return cid.Undef, "", nil, err
+	}
+	if isPrefix {
+		return cid.Undef, "", nil, fmt.Errorf("unexpected long header l2")
+	}
+	var mode string
+	var noEndReturn bool
+	matches := plusLineRegex.FindSubmatch(l2)
+	if len(matches) >= 2 {
+		mode = string(matches[1])
+	}
+	if len(matches) < 2 || string(matches[len(matches)-1]) != cs {
+		return cid.Undef, "", nil, fmt.Errorf("mismatched cid lines: %v", string(l2))
+	}
+	if len(matches[2]) > 0 {
+		noEndReturn = (string(matches[2]) == "(no-end-cr) ")
+	}
+	c, err := cid.Parse(cs)
+	if err != nil {
+		return cid.Undef, "", nil, err
+	}
+
+	// skip over @@ line.
+ l3, isPrefix, err := br.ReadLine() + if err != nil { + return cid.Undef, "", nil, err + } + if isPrefix { + return cid.Undef, "", nil, fmt.Errorf("unexpected long header l3") + } + if !strings.HasPrefix(string(l3), "@@") { + return cid.Undef, "", nil, fmt.Errorf("unexpected missing chunk prefix") + } + + // keep going until next chunk or end. + outBuf := bytes.NewBuffer(nil) + for { + peek, err := br.Peek(4) + if err != nil && err != io.EOF { + return cid.Undef, "", nil, err + } + if bytes.Equal(peek, []byte("--- ")) { + break + } + // accumulate to buffer. + l, err := br.ReadBytes('\n') + if l != nil { + outBuf.Write(l) + } + if err == io.EOF { + break + } else if err != nil { + return cid.Undef, "", nil, err + } + } + + ob := outBuf.Bytes() + + // remove the final line return + if len(ob) > 2 && bytes.Equal(ob[len(ob)-2:], []byte("\r\n")) { + ob = ob[:len(ob)-2] + } else if len(ob) > 1 && bytes.Equal(ob[len(ob)-1:], []byte("\n")) { + ob = ob[:len(ob)-1] + } + + if noEndReturn && len(ob) > 2 && bytes.Equal(ob[len(ob)-2:], []byte("\r\n")) { + ob = ob[:len(ob)-2] + } else if noEndReturn && len(ob) > 1 && bytes.Equal(ob[len(ob)-1:], []byte("\n")) { + ob = ob[:len(ob)-1] + } + + return c, mode, ob, nil +} diff --git a/ipld/car/cmd/car/create.go b/ipld/car/cmd/car/create.go new file mode 100644 index 0000000000..7b50b6458e --- /dev/null +++ b/ipld/car/cmd/car/create.go @@ -0,0 +1,130 @@ +package main + +import ( + "bytes" + "context" + "fmt" + "io" + "path" + + "github.com/ipfs/go-cid" + blocks "github.com/ipfs/go-libipfs/blocks" + "github.com/ipfs/go-unixfsnode/data/builder" + "github.com/ipld/go-car/v2" + "github.com/ipld/go-car/v2/blockstore" + dagpb "github.com/ipld/go-codec-dagpb" + "github.com/ipld/go-ipld-prime" + cidlink "github.com/ipld/go-ipld-prime/linking/cid" + "github.com/multiformats/go-multicodec" + "github.com/multiformats/go-multihash" + "github.com/urfave/cli/v2" +) + +// CreateCar creates a car +func CreateCar(c *cli.Context) error { + var err error + if c.Args().Len() == 0 { + return fmt.Errorf("a source location to build the car from must be specified") + } + + if !c.IsSet("file") { + return fmt.Errorf("a file destination must be specified") + } + + // make a cid with the right length that we eventually will patch with the root. + hasher, err := multihash.GetHasher(multihash.SHA2_256) + if err != nil { + return err + } + digest := hasher.Sum([]byte{}) + hash, err := multihash.Encode(digest, multihash.SHA2_256) + if err != nil { + return err + } + proxyRoot := cid.NewCidV1(uint64(multicodec.DagPb), hash) + + options := []car.Option{} + switch c.Int("version") { + case 1: + options = []car.Option{blockstore.WriteAsCarV1(true)} + case 2: + // already the default + default: + return fmt.Errorf("invalid CAR version %d", c.Int("version")) + } + + cdest, err := blockstore.OpenReadWrite(c.String("file"), []cid.Cid{proxyRoot}, options...) + if err != nil { + return err + } + + // Write the unixfs blocks into the store. + root, err := writeFiles(c.Context, cdest, c.Args().Slice()...) + if err != nil { + return err + } + + if err := cdest.Finalize(); err != nil { + return err + } + // re-open/finalize with the final root. 
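+	// This works because the placeholder root written above was derived from the
+	// same multihash, so the real root has the same serialized length and the
+	// header can be rewritten in place.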
+	return car.ReplaceRootsInFile(c.String("file"), []cid.Cid{root})
+}
+
+func writeFiles(ctx context.Context, bs *blockstore.ReadWrite, paths ...string) (cid.Cid, error) {
+	ls := cidlink.DefaultLinkSystem()
+	ls.TrustedStorage = true
+	ls.StorageReadOpener = func(_ ipld.LinkContext, l ipld.Link) (io.Reader, error) {
+		cl, ok := l.(cidlink.Link)
+		if !ok {
+			return nil, fmt.Errorf("not a cidlink")
+		}
+		blk, err := bs.Get(ctx, cl.Cid)
+		if err != nil {
+			return nil, err
+		}
+		return bytes.NewBuffer(blk.RawData()), nil
+	}
+	ls.StorageWriteOpener = func(_ ipld.LinkContext) (io.Writer, ipld.BlockWriteCommitter, error) {
+		buf := bytes.NewBuffer(nil)
+		return buf, func(l ipld.Link) error {
+			cl, ok := l.(cidlink.Link)
+			if !ok {
+				return fmt.Errorf("not a cidlink")
+			}
+			blk, err := blocks.NewBlockWithCid(buf.Bytes(), cl.Cid)
+			if err != nil {
+				return err
+			}
+			return bs.Put(ctx, blk)
+		}, nil
+	}
+
+	topLevel := make([]dagpb.PBLink, 0, len(paths))
+	for _, p := range paths {
+		l, size, err := builder.BuildUnixFSRecursive(p, &ls)
+		if err != nil {
+			return cid.Undef, err
+		}
+		name := path.Base(p)
+		entry, err := builder.BuildUnixFSDirectoryEntry(name, int64(size), l)
+		if err != nil {
+			return cid.Undef, err
+		}
+		topLevel = append(topLevel, entry)
+	}
+
+	// make a directory for the file(s).
+
+	root, _, err := builder.BuildUnixFSDirectory(topLevel, &ls)
+	if err != nil {
+		return cid.Undef, err
+	}
+	rcl, ok := root.(cidlink.Link)
+	if !ok {
+		return cid.Undef, fmt.Errorf("could not interpret %s", root)
+	}
+
+	return rcl.Cid, nil
+}
diff --git a/ipld/car/cmd/car/detach.go b/ipld/car/cmd/car/detach.go
new file mode 100644
index 0000000000..e04eba9dd5
--- /dev/null
+++ b/ipld/car/cmd/car/detach.go
@@ -0,0 +1,73 @@
+package main
+
+import (
+	"fmt"
+	"io"
+	"os"
+
+	carv2 "github.com/ipld/go-car/v2"
+	"github.com/ipld/go-car/v2/index"
+	"github.com/multiformats/go-multihash"
+	"github.com/urfave/cli/v2"
+)
+
+// DetachCar is a command to output the index part of a car.
+func DetachCar(c *cli.Context) error {
+	r, err := carv2.OpenReader(c.Args().Get(0))
+	if err != nil {
+		return err
+	}
+	defer r.Close()
+
+	if !r.Header.HasIndex() {
+		return fmt.Errorf("no index present")
+	}
+
+	outStream := os.Stdout
+	if c.Args().Len() >= 2 {
+		outStream, err = os.Create(c.Args().Get(1))
+		if err != nil {
+			return err
+		}
+	}
+	defer outStream.Close()
+
+	ir, err := r.IndexReader()
+	if err != nil {
+		return err
+	}
+	_, err = io.Copy(outStream, ir)
+	return err
+}
+
+// DetachCarList prints a list of what's found in a detached index.
+func DetachCarList(c *cli.Context) error { + var err error + + inStream := os.Stdin + if c.Args().Len() >= 1 { + inStream, err = os.Open(c.Args().First()) + if err != nil { + return err + } + defer inStream.Close() + } + + idx, err := index.ReadFrom(inStream) + if err != nil { + return err + } + + if iidx, ok := idx.(index.IterableIndex); ok { + err := iidx.ForEach(func(mh multihash.Multihash, offset uint64) error { + fmt.Printf("%s %d\n", mh, offset) + return nil + }) + if err != nil { + return err + } + return nil + } + + return fmt.Errorf("index of codec %s is not iterable", idx.Codec()) +} diff --git a/ipld/car/cmd/car/extract.go b/ipld/car/cmd/car/extract.go new file mode 100644 index 0000000000..f9373fd37b --- /dev/null +++ b/ipld/car/cmd/car/extract.go @@ -0,0 +1,443 @@ +package main + +import ( + "context" + "errors" + "fmt" + "io" + "os" + "path" + "path/filepath" + "runtime" + "strings" + "sync" + + "github.com/ipfs/go-cid" + "github.com/ipfs/go-unixfsnode" + "github.com/ipfs/go-unixfsnode/data" + "github.com/ipfs/go-unixfsnode/file" + "github.com/ipld/go-car/v2" + carstorage "github.com/ipld/go-car/v2/storage" + dagpb "github.com/ipld/go-codec-dagpb" + "github.com/ipld/go-ipld-prime" + cidlink "github.com/ipld/go-ipld-prime/linking/cid" + basicnode "github.com/ipld/go-ipld-prime/node/basic" + "github.com/ipld/go-ipld-prime/storage" + "github.com/urfave/cli/v2" +) + +var ErrNotDir = fmt.Errorf("not a directory") + +// ExtractCar pulls files and directories out of a car +func ExtractCar(c *cli.Context) error { + outputDir, err := os.Getwd() + if err != nil { + return err + } + if c.Args().Present() { + outputDir = c.Args().First() + } + + var store storage.ReadableStorage + var roots []cid.Cid + + if c.String("file") == "" { + if f, ok := c.App.Reader.(*os.File); ok { + stat, err := f.Stat() + if err != nil { + return err + } + if (stat.Mode() & os.ModeCharDevice) != 0 { + // Is a terminal. In reality the user is unlikely to actually paste + // CAR data into this terminal, but this message may serve to make + // them aware that they can/should pipe data into this command. 
+ stopKeys := "Ctrl+D" + if runtime.GOOS == "windows" { + stopKeys = "Ctrl+Z, Enter" + } + fmt.Fprintf(c.App.ErrWriter, "Reading from stdin; use %s to end\n", stopKeys) + } + } + var err error + store, roots, err = NewStdinReadStorage(c.App.Reader) + if err != nil { + return err + } + } else { + carFile, err := os.Open(c.String("file")) + if err != nil { + return err + } + store, err = carstorage.OpenReadable(carFile) + if err != nil { + return err + } + roots = store.(carstorage.ReadableCar).Roots() + } + + ls := cidlink.DefaultLinkSystem() + ls.TrustedStorage = true + ls.SetReadStorage(store) + + path, err := pathSegments(c.String("path")) + if err != nil { + return err + } + + var extractedFiles int + for _, root := range roots { + count, err := extractRoot(c, &ls, root, outputDir, path) + if err != nil { + return err + } + extractedFiles += count + } + if extractedFiles == 0 { + return cli.Exit("no files extracted", 1) + } else { + fmt.Fprintf(c.App.ErrWriter, "extracted %d file(s)\n", extractedFiles) + } + + return nil +} + +func extractRoot(c *cli.Context, ls *ipld.LinkSystem, root cid.Cid, outputDir string, path []string) (int, error) { + if root.Prefix().Codec == cid.Raw { + if c.IsSet("verbose") { + fmt.Fprintf(c.App.ErrWriter, "skipping raw root %s\n", root) + } + return 0, nil + } + + pbn, err := ls.Load(ipld.LinkContext{}, cidlink.Link{Cid: root}, dagpb.Type.PBNode) + if err != nil { + return 0, err + } + pbnode := pbn.(dagpb.PBNode) + + ufn, err := unixfsnode.Reify(ipld.LinkContext{}, pbnode, ls) + if err != nil { + return 0, err + } + + var outputResolvedDir string + if outputDir != "-" { + outputResolvedDir, err = filepath.EvalSymlinks(outputDir) + if err != nil { + return 0, err + } + if _, err := os.Stat(outputResolvedDir); os.IsNotExist(err) { + if err := os.Mkdir(outputResolvedDir, 0755); err != nil { + return 0, err + } + } + } + + count, err := extractDir(c, ls, ufn, outputResolvedDir, "/", path) + if err != nil { + if !errors.Is(err, ErrNotDir) { + return 0, fmt.Errorf("%s: %w", root, err) + } + + // if it's not a directory, it's a file. + ufsData, err := pbnode.LookupByString("Data") + if err != nil { + return 0, err + } + ufsBytes, err := ufsData.AsBytes() + if err != nil { + return 0, err + } + ufsNode, err := data.DecodeUnixFSData(ufsBytes) + if err != nil { + return 0, err + } + var outputName string + if outputDir != "-" { + outputName = filepath.Join(outputResolvedDir, "unknown") + } + if ufsNode.DataType.Int() == data.Data_File || ufsNode.DataType.Int() == data.Data_Raw { + if err := extractFile(c, ls, pbnode, outputName); err != nil { + return 0, err + } + } + return 1, nil + } + + return count, nil +} + +func resolvePath(root, pth string) (string, error) { + rp, err := filepath.Rel("/", pth) + if err != nil { + return "", fmt.Errorf("couldn't check relative-ness of %s: %w", pth, err) + } + joined := path.Join(root, rp) + + basename := path.Dir(joined) + final, err := filepath.EvalSymlinks(basename) + if err != nil { + return "", fmt.Errorf("couldn't eval symlinks in %s: %w", basename, err) + } + if final != path.Clean(basename) { + return "", fmt.Errorf("path attempts to redirect through symlinks") + } + return joined, nil +} + +func extractDir(c *cli.Context, ls *ipld.LinkSystem, n ipld.Node, outputRoot, outputPath string, matchPath []string) (int, error) { + if outputRoot != "" { + dirPath, err := resolvePath(outputRoot, outputPath) + if err != nil { + return 0, err + } + // make the directory. 
+ if err := os.MkdirAll(dirPath, 0755); err != nil { + return 0, err + } + } + + if n.Kind() != ipld.Kind_Map { + return 0, ErrNotDir + } + + subPath := matchPath + if len(matchPath) > 0 { + subPath = matchPath[1:] + } + + extractElement := func(name string, n ipld.Node) (int, error) { + var nextRes string + if outputRoot != "" { + var err error + nextRes, err = resolvePath(outputRoot, path.Join(outputPath, name)) + if err != nil { + return 0, err + } + if c.IsSet("verbose") { + fmt.Fprintf(c.App.Writer, "%s\n", nextRes) + } + } + + if n.Kind() != ipld.Kind_Link { + return 0, fmt.Errorf("unexpected map value for %s at %s", name, outputPath) + } + // a directory may be represented as a map of name: if unixADL is applied + vl, err := n.AsLink() + if err != nil { + return 0, err + } + dest, err := ls.Load(ipld.LinkContext{}, vl, basicnode.Prototype.Any) + if err != nil { + if nf, ok := err.(interface{ NotFound() bool }); ok && nf.NotFound() { + fmt.Fprintf(c.App.ErrWriter, "data for entry not found: %s (skipping...)\n", path.Join(outputPath, name)) + return 0, nil + } + return 0, err + } + // degenerate files are handled here. + if dest.Kind() == ipld.Kind_Bytes { + if err := extractFile(c, ls, dest, nextRes); err != nil { + return 0, err + } + return 1, nil + } + + // dir / pbnode + pbb := dagpb.Type.PBNode.NewBuilder() + if err := pbb.AssignNode(dest); err != nil { + return 0, err + } + pbnode := pbb.Build().(dagpb.PBNode) + + // interpret dagpb 'data' as unixfs data and look at type. + ufsData, err := pbnode.LookupByString("Data") + if err != nil { + return 0, err + } + ufsBytes, err := ufsData.AsBytes() + if err != nil { + return 0, err + } + ufsNode, err := data.DecodeUnixFSData(ufsBytes) + if err != nil { + return 0, err + } + + switch ufsNode.DataType.Int() { + case data.Data_Directory, data.Data_HAMTShard: + ufn, err := unixfsnode.Reify(ipld.LinkContext{}, pbnode, ls) + if err != nil { + return 0, err + } + return extractDir(c, ls, ufn, outputRoot, path.Join(outputPath, name), subPath) + case data.Data_File, data.Data_Raw: + if err := extractFile(c, ls, pbnode, nextRes); err != nil { + return 0, err + } + return 1, nil + case data.Data_Symlink: + if nextRes == "" { + return 0, fmt.Errorf("cannot extract a symlink to stdout") + } + data := ufsNode.Data.Must().Bytes() + if err := os.Symlink(string(data), nextRes); err != nil { + return 0, err + } + return 1, nil + default: + return 0, fmt.Errorf("unknown unixfs type: %d", ufsNode.DataType.Int()) + } + } + + // specific path segment + if len(matchPath) > 0 { + val, err := n.LookupByString(matchPath[0]) + if err != nil { + return 0, err + } + return extractElement(matchPath[0], val) + } + + if outputPath == "-" && len(matchPath) == 0 { + return 0, fmt.Errorf("cannot extract a directory to stdout, use a path to extract a specific file") + } + + // everything + var count int + var shardSkip int + mi := n.MapIterator() + for !mi.Done() { + key, val, err := mi.Next() + if err != nil { + if nf, ok := err.(interface{ NotFound() bool }); ok && nf.NotFound() { + shardSkip++ + continue + } + return 0, err + } + ks, err := key.AsString() + if err != nil { + return 0, err + } + ecount, err := extractElement(ks, val) + if err != nil { + return 0, err + } + count += ecount + } + if shardSkip > 0 { + fmt.Fprintf(c.App.ErrWriter, "data for entry not found for %d unknown sharded entries (skipped...)\n", shardSkip) + } + return count, nil +} + +func extractFile(c *cli.Context, ls *ipld.LinkSystem, n ipld.Node, outputName string) error { + node, err := 
file.NewUnixFSFile(c.Context, n, ls) + if err != nil { + return err + } + nlr, err := node.AsLargeBytes() + if err != nil { + return err + } + var f *os.File + if outputName == "" { + f = os.Stdout + } else { + f, err = os.Create(outputName) + if err != nil { + return err + } + defer f.Close() + } + _, err = io.Copy(f, nlr) + return err +} + +// TODO: dedupe this with lassie, probably into go-unixfsnode +func pathSegments(path string) ([]string, error) { + segments := strings.Split(path, "/") + filtered := make([]string, 0, len(segments)) + for i := 0; i < len(segments); i++ { + if segments[i] == "" { + // Allow one leading and one trailing '/' at most + if i == 0 || i == len(segments)-1 { + continue + } + return nil, fmt.Errorf("invalid empty path segment at position %d", i) + } + if segments[i] == "." || segments[i] == ".." { + return nil, fmt.Errorf("'%s' is unsupported in paths", segments[i]) + } + filtered = append(filtered, segments[i]) + } + return filtered, nil +} + +var _ storage.ReadableStorage = (*stdinReadStorage)(nil) + +type stdinReadStorage struct { + blocks map[string][]byte + done bool + lk *sync.RWMutex + cond *sync.Cond +} + +func NewStdinReadStorage(reader io.Reader) (*stdinReadStorage, []cid.Cid, error) { + var lk sync.RWMutex + srs := &stdinReadStorage{ + blocks: make(map[string][]byte), + lk: &lk, + cond: sync.NewCond(&lk), + } + rdr, err := car.NewBlockReader(reader) + if err != nil { + return nil, nil, err + } + go func() { + for { + blk, err := rdr.Next() + if err == io.EOF { + srs.lk.Lock() + srs.done = true + srs.lk.Unlock() + return + } + if err != nil { + panic(err) + } + srs.lk.Lock() + srs.blocks[string(blk.Cid().Hash())] = blk.RawData() + srs.cond.Broadcast() + srs.lk.Unlock() + } + }() + return srs, rdr.Roots, nil +} + +func (srs *stdinReadStorage) Has(ctx context.Context, key string) (bool, error) { + _, err := srs.Get(ctx, key) + if err != nil { + return false, err + } + return true, nil +} + +func (srs *stdinReadStorage) Get(ctx context.Context, key string) ([]byte, error) { + c, err := cid.Cast([]byte(key)) + if err != nil { + return nil, err + } + srs.lk.Lock() + defer srs.lk.Unlock() + for { + if data, ok := srs.blocks[string(c.Hash())]; ok { + return data, nil + } + if srs.done { + return nil, carstorage.ErrNotFound{Cid: c} + } + srs.cond.Wait() + } +} diff --git a/ipld/car/cmd/car/filter.go b/ipld/car/cmd/car/filter.go new file mode 100644 index 0000000000..a76b6bd05a --- /dev/null +++ b/ipld/car/cmd/car/filter.go @@ -0,0 +1,128 @@ +package main + +import ( + "bufio" + "fmt" + "io" + "os" + "strings" + + "github.com/ipfs/go-cid" + carv2 "github.com/ipld/go-car/v2" + "github.com/ipld/go-car/v2/blockstore" + "github.com/urfave/cli/v2" +) + +// FilterCar is a command to select a subset of a car by CID. +func FilterCar(c *cli.Context) error { + if c.Args().Len() < 2 { + return fmt.Errorf("an output filename must be provided") + } + + fd, err := os.Open(c.Args().First()) + if err != nil { + return err + } + defer fd.Close() + rd, err := carv2.NewBlockReader(fd) + if err != nil { + return err + } + + // Get the set of CIDs from stdin. 
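+	// Typical invocations (a sketch; this assumes the flag is registered as
+	// --cidFile in the CLI setup, which is not shown in this diff):
+	//
+	//	car list in.car | car filter in.car subset.car
+	//	car filter --cidFile allow.txt in.car subset.car
+	//
+	// One CID per line; blank lines are skipped and duplicates only warn.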
+	inStream := os.Stdin
+	if c.IsSet("cidFile") {
+		inStream, err = os.Open(c.String("cidFile"))
+		if err != nil {
+			return err
+		}
+		defer inStream.Close()
+	}
+	cidMap, err := parseCIDS(inStream)
+	if err != nil {
+		return err
+	}
+	fmt.Printf("filtering to %d cids\n", len(cidMap))
+
+	outRoots := make([]cid.Cid, 0)
+	for _, r := range rd.Roots {
+		if _, ok := cidMap[r]; ok {
+			outRoots = append(outRoots, r)
+		}
+	}
+
+	outPath := c.Args().Get(1)
+	if !c.Bool("append") {
+		if _, err := os.Stat(outPath); err == nil || !os.IsNotExist(err) {
+			// output to an existing file.
+			if err := os.Truncate(outPath, 0); err != nil {
+				return err
+			}
+		}
+	} else {
+		// roots will need to be whatever is in the output already.
+		cv2r, err := carv2.OpenReader(outPath)
+		if err != nil {
+			return err
+		}
+		if cv2r.Version != 2 {
+			return fmt.Errorf("can only append to version 2 car files")
+		}
+		outRoots, err = cv2r.Roots()
+		if err != nil {
+			return err
+		}
+		_ = cv2r.Close()
+	}
+
+	if len(outRoots) == 0 {
+		fmt.Fprintf(os.Stderr, "warning: no roots defined after filtering\n")
+	}
+
+	bs, err := blockstore.OpenReadWrite(outPath, outRoots)
+	if err != nil {
+		return err
+	}
+
+	for {
+		blk, err := rd.Next()
+		if err != nil {
+			if err == io.EOF {
+				break
+			}
+			return err
+		}
+		if _, ok := cidMap[blk.Cid()]; ok {
+			if err := bs.Put(c.Context, blk); err != nil {
+				return err
+			}
+		}
+	}
+	return bs.Finalize()
+}
+
+func parseCIDS(r io.Reader) (map[cid.Cid]struct{}, error) {
+	cids := make(map[cid.Cid]struct{})
+	br := bufio.NewReader(r)
+	for {
+		line, _, err := br.ReadLine()
+		if err != nil {
+			if err == io.EOF {
+				return cids, nil
+			}
+			return nil, err
+		}
+		trimLine := strings.TrimSpace(string(line))
+		if len(trimLine) == 0 {
+			continue
+		}
+		c, err := cid.Parse(trimLine)
+		if err != nil {
+			return nil, err
+		}
+		if _, ok := cids[c]; ok {
+			fmt.Fprintf(os.Stderr, "duplicate cid: %s\n", c)
+		}
+		cids[c] = struct{}{}
+	}
+}
diff --git a/ipld/car/cmd/car/get.go b/ipld/car/cmd/car/get.go
new file mode 100644
index 0000000000..f5d5b1cdfa
--- /dev/null
+++ b/ipld/car/cmd/car/get.go
@@ -0,0 +1,215 @@
+package main
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+
+	"io"
+	"os"
+
+	dagpb "github.com/ipld/go-codec-dagpb"
+	"github.com/ipld/go-ipld-prime"
+	_ "github.com/ipld/go-ipld-prime/codec/cbor"
+	_ "github.com/ipld/go-ipld-prime/codec/dagcbor"
+	_ "github.com/ipld/go-ipld-prime/codec/dagjson"
+	_ "github.com/ipld/go-ipld-prime/codec/json"
+	_ "github.com/ipld/go-ipld-prime/codec/raw"
+
+	"github.com/ipfs/go-cid"
+	ipldfmt "github.com/ipfs/go-ipld-format"
+	"github.com/ipfs/go-unixfsnode"
+	"github.com/ipld/go-car"
+	"github.com/ipld/go-car/v2/blockstore"
+	"github.com/ipld/go-ipld-prime/datamodel"
+	"github.com/ipld/go-ipld-prime/linking"
+	cidlink "github.com/ipld/go-ipld-prime/linking/cid"
+	"github.com/ipld/go-ipld-prime/node/basicnode"
+	"github.com/ipld/go-ipld-prime/traversal"
+	"github.com/ipld/go-ipld-prime/traversal/selector"
+	selectorParser "github.com/ipld/go-ipld-prime/traversal/selector/parse"
+	"github.com/urfave/cli/v2"
+)
+
+// GetCarBlock is a command to get a block out of a car
+func GetCarBlock(c *cli.Context) error {
+	if c.Args().Len() < 2 {
+		return fmt.Errorf("usage: car get-block <file.car> <block cid> [output file]")
+	}
+
+	bs, err := blockstore.OpenReadOnly(c.Args().Get(0))
+	if err != nil {
+		return err
+	}
+
+	// string to CID
+	blkCid, err := cid.Parse(c.Args().Get(1))
+	if err != nil {
+		return err
+	}
+
+	blk, err := bs.Get(c.Context, blkCid)
+	if err != nil {
+		return err
+	}
+
+	outStream := os.Stdout
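+	// A third positional argument redirects the raw block body into a file;
+	// otherwise the bytes go to stdout.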
+	if c.Args().Len() >= 3 {
+		outStream, err = os.Create(c.Args().Get(2))
+		if err != nil {
+			return err
+		}
+		defer outStream.Close()
+	}
+
+	_, err = outStream.Write(blk.RawData())
+	return err
+}
+
+// GetCarDag is a command to get a dag out of a car
+func GetCarDag(c *cli.Context) error {
+	if c.Args().Len() < 2 {
+		return fmt.Errorf("usage: car get-dag [-s selector] <file.car> [root cid] <output file>")
+	}
+
+	// if the root cid is omitted we'll read it from the root of file.car.
+	output := c.Args().Get(1)
+	var rootCid cid.Cid
+
+	bs, err := blockstore.OpenReadOnly(c.Args().Get(0))
+	if err != nil {
+		return err
+	}
+
+	if c.Args().Len() == 2 {
+		roots, err := bs.Roots()
+		if err != nil {
+			return err
+		}
+		if len(roots) != 1 {
+			return fmt.Errorf("car file does not have exactly one root, dag root must be specified explicitly")
+		}
+		rootCid = roots[0]
+	} else {
+		rootCid, err = cid.Parse(output)
+		if err != nil {
+			return err
+		}
+		output = c.Args().Get(2)
+	}
+
+	strict := c.Bool("strict")
+
+	// selector traversal; default to MatchAllRecursively, which walks the whole DAG,
+	// because we only care about the blocks loaded during the walk, not the nodes matched
+	sel := selectorParser.CommonSelector_MatchAllRecursively
+	if c.IsSet("selector") {
+		sel, err = selectorParser.ParseJSONSelector(c.String("selector"))
+		if err != nil {
+			return err
+		}
+	}
+	linkVisitOnlyOnce := !c.IsSet("selector") // if using a custom selector, this isn't as safe
+
+	switch c.Int("version") {
+	case 2:
+		return writeCarV2(c.Context, rootCid, output, bs, strict, sel, linkVisitOnlyOnce)
+	case 1:
+		return writeCarV1(rootCid, output, bs, strict, sel, linkVisitOnlyOnce)
+	default:
+		return fmt.Errorf("invalid CAR version %d", c.Int("version"))
+	}
+}
+
+func writeCarV2(ctx context.Context, rootCid cid.Cid, output string, bs *blockstore.ReadOnly, strict bool, sel datamodel.Node, linkVisitOnlyOnce bool) error {
+	_ = os.Remove(output)
+
+	outStore, err := blockstore.OpenReadWrite(output, []cid.Cid{rootCid}, blockstore.AllowDuplicatePuts(false))
+	if err != nil {
+		return err
+	}
+
+	ls := cidlink.DefaultLinkSystem()
+	ls.KnownReifiers = map[string]linking.NodeReifier{"unixfs": unixfsnode.Reify}
+	ls.TrustedStorage = true
+	ls.StorageReadOpener = func(_ linking.LinkContext, l datamodel.Link) (io.Reader, error) {
+		if cl, ok := l.(cidlink.Link); ok {
+			blk, err := bs.Get(ctx, cl.Cid)
+			if err != nil {
+				if ipldfmt.IsNotFound(err) {
+					if strict {
+						return nil, err
+					}
+					return nil, traversal.SkipMe{}
+				}
+				return nil, err
+			}
+			if err := outStore.Put(ctx, blk); err != nil {
+				return nil, err
+			}
+			return bytes.NewBuffer(blk.RawData()), nil
+		}
+		return nil, fmt.Errorf("unknown link type: %T", l)
+	}
+
+	nsc := func(lnk datamodel.Link, lctx ipld.LinkContext) (datamodel.NodePrototype, error) {
+		if lnk, ok := lnk.(cidlink.Link); ok && lnk.Cid.Prefix().Codec == 0x70 {
+			return dagpb.Type.PBNode, nil
+		}
+		return basicnode.Prototype.Any, nil
+	}
+
+	rootLink := cidlink.Link{Cid: rootCid}
+	ns, _ := nsc(rootLink, ipld.LinkContext{})
+	rootNode, err := ls.Load(ipld.LinkContext{}, rootLink, ns)
+	if err != nil {
+		return err
+	}
+
+	traversalProgress := traversal.Progress{
+		Cfg: &traversal.Config{
+			LinkSystem:                     ls,
+			LinkTargetNodePrototypeChooser: nsc,
+			LinkVisitOnlyOnce:              linkVisitOnlyOnce,
+		},
+	}
+
+	s, err := selector.CompileSelector(sel)
+	if err != nil {
+		return err
+	}
+
+	err = traversalProgress.WalkMatching(rootNode, s, func(p traversal.Progress, n datamodel.Node) error {
+		lb, ok := n.(datamodel.LargeBytesNode)
+		if 
ok { + rs, err := lb.AsLargeBytes() + if err == nil { + _, err := io.Copy(io.Discard, rs) + if err != nil { + return err + } + } + } + return nil + }) + if err != nil { + return err + } + + return outStore.Finalize() +} + +func writeCarV1(rootCid cid.Cid, output string, bs *blockstore.ReadOnly, strict bool, sel datamodel.Node, linkVisitOnlyOnce bool) error { + opts := make([]car.Option, 0) + if linkVisitOnlyOnce { + opts = append(opts, car.TraverseLinksOnlyOnce()) + } + sc := car.NewSelectiveCar(context.Background(), bs, []car.Dag{{Root: rootCid, Selector: sel}}, opts...) + f, err := os.Create(output) + if err != nil { + return err + } + defer f.Close() + + return sc.Write(f) +} diff --git a/ipld/car/cmd/car/index.go b/ipld/car/cmd/car/index.go new file mode 100644 index 0000000000..fc6bd5d611 --- /dev/null +++ b/ipld/car/cmd/car/index.go @@ -0,0 +1,210 @@ +package main + +import ( + "bufio" + "fmt" + "io" + "os" + + "github.com/ipfs/go-cid" + carv1 "github.com/ipld/go-car" + carv2 "github.com/ipld/go-car/v2" + "github.com/ipld/go-car/v2/index" + "github.com/multiformats/go-multicodec" + "github.com/multiformats/go-varint" + "github.com/urfave/cli/v2" +) + +// IndexCar is a command to add an index to a car +func IndexCar(c *cli.Context) error { + r, err := carv2.OpenReader(c.Args().Get(0)) + if err != nil { + return err + } + defer r.Close() + + if c.Int("version") == 1 { + if c.IsSet("codec") && c.String("codec") != "none" { + return fmt.Errorf("'none' is the only supported codec for a v1 car") + } + outStream := os.Stdout + if c.Args().Len() >= 2 { + outStream, err = os.Create(c.Args().Get(1)) + if err != nil { + return err + } + } + defer outStream.Close() + + dr, err := r.DataReader() + if err != nil { + return err + } + _, err = io.Copy(outStream, dr) + return err + } + + if c.Int("version") != 2 { + return fmt.Errorf("invalid CAR version %d", c.Int("version")) + } + + var idx index.Index + if c.String("codec") != "none" { + var mc multicodec.Code + if err := mc.Set(c.String("codec")); err != nil { + return err + } + idx, err = index.New(mc) + if err != nil { + return err + } + } + + outStream := os.Stdout + if c.Args().Len() >= 2 { + outStream, err = os.Create(c.Args().Get(1)) + if err != nil { + return err + } + } + defer outStream.Close() + + v1r, err := r.DataReader() + if err != nil { + return err + } + + if r.Version == 1 { + fi, err := os.Stat(c.Args().Get(0)) + if err != nil { + return err + } + r.Header.DataSize = uint64(fi.Size()) + } + v2Header := carv2.NewHeader(r.Header.DataSize) + if c.String("codec") == "none" { + v2Header.IndexOffset = 0 + if _, err := outStream.Write(carv2.Pragma); err != nil { + return err + } + if _, err := v2Header.WriteTo(outStream); err != nil { + return err + } + if _, err := io.Copy(outStream, v1r); err != nil { + return err + } + return nil + } + + if _, err := outStream.Write(carv2.Pragma); err != nil { + return err + } + if _, err := v2Header.WriteTo(outStream); err != nil { + return err + } + + // collect records as we go through the v1r + br := bufio.NewReader(v1r) + hdr, err := carv1.ReadHeader(br) + if err != nil { + return fmt.Errorf("error reading car header: %w", err) + } + if err := carv1.WriteHeader(hdr, outStream); err != nil { + return err + } + + records := make([]index.Record, 0) + var sectionOffset int64 + if sectionOffset, err = v1r.Seek(0, io.SeekCurrent); err != nil { + return err + } + sectionOffset -= int64(br.Buffered()) + + for { + // Read the section's length. 
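+		// Every CARv1 section is framed as
+		//
+		//	uvarint(len(CID || block)) || CID || block
+		//
+		// so the uvarint read here covers the CID and the block body together;
+		// the CID is peeled back off below to build the index records.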
+ sectionLen, err := varint.ReadUvarint(br) + if err != nil { + if err == io.EOF { + break + } + return err + } + if _, err := outStream.Write(varint.ToUvarint(sectionLen)); err != nil { + return err + } + + // Null padding; by default it's an error. + // TODO: integrate corresponding ReadOption + if sectionLen == 0 { + // TODO: pad writer to expected length. + break + } + + // Read the CID. + cidLen, c, err := cid.CidFromReader(br) + if err != nil { + return err + } + records = append(records, index.Record{Cid: c, Offset: uint64(sectionOffset)}) + if _, err := c.WriteBytes(outStream); err != nil { + return err + } + + // Seek to the next section by skipping the block. + // The section length includes the CID, so subtract it. + remainingSectionLen := int64(sectionLen) - int64(cidLen) + if _, err := io.CopyN(outStream, br, remainingSectionLen); err != nil { + return err + } + sectionOffset += int64(sectionLen) + int64(varint.UvarintSize(sectionLen)) + } + + if err := idx.Load(records); err != nil { + return err + } + + _, err = index.WriteTo(idx, outStream) + return err +} + +// CreateIndex is a command to write out an index of the CAR file +func CreateIndex(c *cli.Context) error { + r, err := carv2.OpenReader(c.Args().Get(0)) + if err != nil { + return err + } + defer r.Close() + + outStream := os.Stdout + if c.Args().Len() >= 2 { + outStream, err = os.Create(c.Args().Get(1)) + if err != nil { + return err + } + } + defer outStream.Close() + + var mc multicodec.Code + if err := mc.Set(c.String("codec")); err != nil { + return err + } + idx, err := index.New(mc) + if err != nil { + return err + } + + dr, err := r.DataReader() + if err != nil { + return err + } + + if err := carv2.LoadIndex(idx, dr); err != nil { + return err + } + + if _, err := index.WriteTo(idx, outStream); err != nil { + return err + } + + return nil +} diff --git a/ipld/car/cmd/car/inspect.go b/ipld/car/cmd/car/inspect.go new file mode 100644 index 0000000000..f320500cd4 --- /dev/null +++ b/ipld/car/cmd/car/inspect.go @@ -0,0 +1,137 @@ +package main + +import ( + "bytes" + "fmt" + "io" + "os" + "sort" + "strings" + + carv2 "github.com/ipld/go-car/v2" + "github.com/multiformats/go-multicodec" + "github.com/urfave/cli/v2" +) + +// InspectCar verifies a CAR and prints a basic report about its contents +func InspectCar(c *cli.Context) (err error) { + inStream := os.Stdin + if c.Args().Len() >= 1 { + inStream, err = os.Open(c.Args().First()) + if err != nil { + return err + } + } + + rd, err := carv2.NewReader(inStream, carv2.ZeroLengthSectionAsEOF(true)) + if err != nil { + return err + } + stats, err := rd.Inspect(c.IsSet("full")) + if err != nil { + return err + } + + if stats.Version == 1 && c.IsSet("full") { // check that we've read all the data + got, err := inStream.Read(make([]byte, 1)) // force EOF + if err != nil && err != io.EOF { + return err + } else if got > 0 { + return fmt.Errorf("unexpected data after EOF: %d", got) + } + } + + var v2s string + if stats.Version == 2 { + idx := "(none)" + if stats.IndexCodec != 0 { + idx = stats.IndexCodec.String() + } + var buf bytes.Buffer + stats.Header.Characteristics.WriteTo(&buf) + v2s = fmt.Sprintf(`Characteristics: %x +Data offset: %d +Data (payload) length: %d +Index offset: %d +Index type: %s +`, buf.Bytes(), stats.Header.DataOffset, stats.Header.DataSize, stats.Header.IndexOffset, idx) + } + + var roots strings.Builder + switch len(stats.Roots) { + case 0: + roots.WriteString(" (none)") + case 1: + roots.WriteString(" ") + roots.WriteString(stats.Roots[0].String()) 
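+	// more than one root: print each on its own indented line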
+ default: + for _, r := range stats.Roots { + roots.WriteString("\n\t") + roots.WriteString(r.String()) + } + } + + var codecs strings.Builder + { + keys := make([]int, len(stats.CodecCounts)) + i := 0 + for codec := range stats.CodecCounts { + keys[i] = int(codec) + i++ + } + sort.Ints(keys) + for _, code := range keys { + codec := multicodec.Code(code) + codecs.WriteString(fmt.Sprintf("\n\t%s: %d", codec, stats.CodecCounts[codec])) + } + } + + var hashers strings.Builder + { + keys := make([]int, len(stats.MhTypeCounts)) + i := 0 + for codec := range stats.MhTypeCounts { + keys[i] = int(codec) + i++ + } + sort.Ints(keys) + for _, code := range keys { + codec := multicodec.Code(code) + hashers.WriteString(fmt.Sprintf("\n\t%s: %d", codec, stats.MhTypeCounts[codec])) + } + } + + rp := "No" + if stats.RootsPresent { + rp = "Yes" + } + + pfmt := `Version: %d +%sRoots:%s +Root blocks present in data: %s +Block count: %d +Min / average / max block length (bytes): %d / %d / %d +Min / average / max CID length (bytes): %d / %d / %d +Block count per codec:%s +CID count per multihash:%s +` + + fmt.Printf( + pfmt, + stats.Version, + v2s, + roots.String(), + rp, + stats.BlockCount, + stats.MinBlockLength, + stats.AvgBlockLength, + stats.MaxBlockLength, + stats.MinCidLength, + stats.AvgCidLength, + stats.MaxCidLength, + codecs.String(), + hashers.String(), + ) + + return nil +} diff --git a/ipld/car/cmd/car/list.go b/ipld/car/cmd/car/list.go new file mode 100644 index 0000000000..62cdb265dd --- /dev/null +++ b/ipld/car/cmd/car/list.go @@ -0,0 +1,222 @@ +package main + +import ( + "bytes" + "fmt" + "io" + "os" + "path" + + "github.com/dustin/go-humanize" + "github.com/ipfs/go-cid" + data "github.com/ipfs/go-unixfsnode/data" + "github.com/ipfs/go-unixfsnode/hamt" + carv2 "github.com/ipld/go-car/v2" + "github.com/ipld/go-car/v2/blockstore" + dagpb "github.com/ipld/go-codec-dagpb" + "github.com/ipld/go-ipld-prime" + cidlink "github.com/ipld/go-ipld-prime/linking/cid" + "github.com/multiformats/go-multicodec" + "github.com/urfave/cli/v2" +) + +// ListCar is a command to output the cids in a car. +func ListCar(c *cli.Context) error { + var err error + outStream := os.Stdout + if c.Args().Len() >= 2 { + outStream, err = os.Create(c.Args().Get(1)) + if err != nil { + return err + } + } + defer outStream.Close() + + if c.Bool("unixfs") { + return listUnixfs(c, outStream) + } + + inStream := os.Stdin + if c.Args().Len() >= 1 { + inStream, err = os.Open(c.Args().First()) + if err != nil { + return err + } + defer inStream.Close() + } + + rd, err := carv2.NewBlockReader(inStream) + if err != nil { + return err + } + + for { + blk, err := rd.Next() + if err != nil { + if err == io.EOF { + break + } + return err + } + if c.Bool("verbose") { + fmt.Fprintf(outStream, "%s: %s\n", + multicodec.Code(blk.Cid().Prefix().Codec).String(), + blk.Cid()) + if blk.Cid().Prefix().Codec == uint64(multicodec.DagPb) { + // parse as dag-pb + builder := dagpb.Type.PBNode.NewBuilder() + if err := dagpb.DecodeBytes(builder, blk.RawData()); err != nil { + fmt.Fprintf(outStream, "\tnot interpretable as dag-pb: %s\n", err) + continue + } + n := builder.Build() + pbn, ok := n.(dagpb.PBNode) + if !ok { + continue + } + dl := 0 + if pbn.Data.Exists() { + dl = len(pbn.Data.Must().Bytes()) + } + fmt.Fprintf(outStream, "\t%d links. 
%d bytes\n", pbn.Links.Length(), dl) + // example link: + li := pbn.Links.ListIterator() + max := 3 + for !li.Done() { + _, l, _ := li.Next() + max-- + pbl, ok := l.(dagpb.PBLink) + if ok && max >= 0 { + hsh := "" + lnk, ok := pbl.Hash.Link().(cidlink.Link) + if ok { + hsh = lnk.Cid.String() + } + name := "" + if pbl.Name.Exists() { + name = pbl.Name.Must().String() + } + size := 0 + if pbl.Tsize.Exists() { + size = int(pbl.Tsize.Must().Int()) + } + fmt.Fprintf(outStream, "\t\t%s[%s] %s\n", name, humanize.Bytes(uint64(size)), hsh) + } + } + if max < 0 { + fmt.Fprintf(outStream, "\t\t(%d total)\n", 3-max) + } + // see if it's unixfs. + ufd, err := data.DecodeUnixFSData(pbn.Data.Must().Bytes()) + if err != nil { + fmt.Fprintf(outStream, "\tnot interpretable as unixfs: %s\n", err) + continue + } + fmt.Fprintf(outStream, "\tUnixfs %s\n", data.DataTypeNames[ufd.FieldDataType().Int()]) + } + } else { + fmt.Fprintf(outStream, "%s\n", blk.Cid()) + } + } + + return err +} + +func listUnixfs(c *cli.Context, outStream io.Writer) error { + if c.Args().Len() == 0 { + return fmt.Errorf("must provide file to read from. unixfs reading requires random access") + } + + bs, err := blockstore.OpenReadOnly(c.Args().First()) + if err != nil { + return err + } + ls := cidlink.DefaultLinkSystem() + ls.TrustedStorage = true + ls.StorageReadOpener = func(_ ipld.LinkContext, l ipld.Link) (io.Reader, error) { + cl, ok := l.(cidlink.Link) + if !ok { + return nil, fmt.Errorf("not a cidlink") + } + blk, err := bs.Get(c.Context, cl.Cid) + if err != nil { + return nil, err + } + return bytes.NewBuffer(blk.RawData()), nil + } + + roots, err := bs.Roots() + if err != nil { + return err + } + for _, r := range roots { + if err := printUnixFSNode(c, "", r, &ls, outStream); err != nil { + return err + } + } + return nil +} + +func printUnixFSNode(c *cli.Context, prefix string, node cid.Cid, ls *ipld.LinkSystem, outStream io.Writer) error { + // it might be a raw file (bytes) node. if so, not actually an error. + if node.Prefix().Codec == cid.Raw { + return nil + } + + pbn, err := ls.Load(ipld.LinkContext{}, cidlink.Link{Cid: node}, dagpb.Type.PBNode) + if err != nil { + return err + } + + pbnode := pbn.(dagpb.PBNode) + + ufd, err := data.DecodeUnixFSData(pbnode.Data.Must().Bytes()) + if err != nil { + return err + } + + if ufd.FieldDataType().Int() == data.Data_Directory { + i := pbnode.Links.Iterator() + for !i.Done() { + _, l := i.Next() + name := path.Join(prefix, l.Name.Must().String()) + fmt.Fprintf(outStream, "%s\n", name) + // recurse into the file/directory + cl, err := l.Hash.AsLink() + if err != nil { + return err + } + if cidl, ok := cl.(cidlink.Link); ok { + if err := printUnixFSNode(c, name, cidl.Cid, ls, outStream); err != nil { + return err + } + } + + } + } else if ufd.FieldDataType().Int() == data.Data_HAMTShard { + hn, err := hamt.AttemptHAMTShardFromNode(c.Context, pbn, ls) + if err != nil { + return err + } + i := hn.Iterator() + for !i.Done() { + n, l := i.Next() + fmt.Fprintf(outStream, "%s\n", path.Join(prefix, n.String())) + // recurse into the file/directory + cl, err := l.AsLink() + if err != nil { + return err + } + if cidl, ok := cl.(cidlink.Link); ok { + if err := printUnixFSNode(c, path.Join(prefix, n.String()), cidl.Cid, ls, outStream); err != nil { + return err + } + } + } + } else { + // file, file chunk, symlink, other un-named entities. 
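+		// nothing to recurse into or print: only directory and HAMT shard nodes
+		// carry child names, so leaf entities contribute no further paths.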
+ return nil + } + + return nil +} diff --git a/ipld/car/cmd/car/root.go b/ipld/car/cmd/car/root.go new file mode 100644 index 0000000000..7e8d5b2c4c --- /dev/null +++ b/ipld/car/cmd/car/root.go @@ -0,0 +1,30 @@ +package main + +import ( + "fmt" + "os" + + carv2 "github.com/ipld/go-car/v2" + "github.com/urfave/cli/v2" +) + +// CarRoot prints the root CID in a car +func CarRoot(c *cli.Context) (err error) { + inStream := os.Stdin + if c.Args().Len() >= 1 { + inStream, err = os.Open(c.Args().First()) + if err != nil { + return err + } + } + + rd, err := carv2.NewBlockReader(inStream) + if err != nil { + return err + } + for _, r := range rd.Roots { + fmt.Printf("%s\n", r.String()) + } + + return nil +} diff --git a/ipld/car/cmd/car/script_test.go b/ipld/car/cmd/car/script_test.go new file mode 100644 index 0000000000..dcf8f6b7a8 --- /dev/null +++ b/ipld/car/cmd/car/script_test.go @@ -0,0 +1,34 @@ +package main + +import ( + "flag" + "os" + "path/filepath" + "testing" + + "github.com/rogpeppe/go-internal/testscript" +) + +func TestMain(m *testing.M) { + os.Exit(testscript.RunMain(m, map[string]func() int{ + "car": main1, + })) +} + +var update = flag.Bool("u", false, "update testscript output files") + +func TestScript(t *testing.T) { + t.Parallel() + testscript.Run(t, testscript.Params{ + Dir: filepath.Join("testdata", "script"), + Setup: func(env *testscript.Env) error { + wd, err := os.Getwd() + if err != nil { + return err + } + env.Setenv("INPUTS", filepath.Join(wd, "testdata", "inputs")) + return nil + }, + UpdateScripts: *update, + }) +} diff --git a/ipld/car/cmd/car/testdata/inputs/badheaderlength.car b/ipld/car/cmd/car/testdata/inputs/badheaderlength.car new file mode 100644 index 0000000000..4f6aa54873 --- /dev/null +++ b/ipld/car/cmd/car/testdata/inputs/badheaderlength.car @@ -0,0 +1 @@ + olLʔ<#oKg#H* gversion \ No newline at end of file diff --git a/ipld/car/cmd/car/testdata/inputs/badsectionlength.car b/ipld/car/cmd/car/testdata/inputs/badsectionlength.car new file mode 100644 index 0000000000..8569fb18d0 Binary files /dev/null and b/ipld/car/cmd/car/testdata/inputs/badsectionlength.car differ diff --git a/ipld/car/cmd/car/testdata/inputs/bafy2bzacebohz654namrgmwjjx4qmtwgxixsd7pn4tlanyrc3g3hwj75hlxrw.block b/ipld/car/cmd/car/testdata/inputs/bafy2bzacebohz654namrgmwjjx4qmtwgxixsd7pn4tlanyrc3g3hwj75hlxrw.block new file mode 100644 index 0000000000..257d8522bd Binary files /dev/null and b/ipld/car/cmd/car/testdata/inputs/bafy2bzacebohz654namrgmwjjx4qmtwgxixsd7pn4tlanyrc3g3hwj75hlxrw.block differ diff --git a/ipld/car/cmd/car/testdata/inputs/sample-v1.car b/ipld/car/cmd/car/testdata/inputs/sample-v1.car new file mode 100644 index 0000000000..47a61c8c2a Binary files /dev/null and b/ipld/car/cmd/car/testdata/inputs/sample-v1.car differ diff --git a/ipld/car/cmd/car/testdata/inputs/sample-wrapped-v2.car b/ipld/car/cmd/car/testdata/inputs/sample-wrapped-v2.car new file mode 100644 index 0000000000..35c4d36899 Binary files /dev/null and b/ipld/car/cmd/car/testdata/inputs/sample-wrapped-v2.car differ diff --git a/ipld/car/cmd/car/testdata/inputs/simple-unixfs-missing-blocks.car b/ipld/car/cmd/car/testdata/inputs/simple-unixfs-missing-blocks.car new file mode 100644 index 0000000000..7e80887138 Binary files /dev/null and b/ipld/car/cmd/car/testdata/inputs/simple-unixfs-missing-blocks.car differ diff --git a/ipld/car/cmd/car/testdata/inputs/simple-unixfs.car b/ipld/car/cmd/car/testdata/inputs/simple-unixfs.car new file mode 100644 index 0000000000..55b8a571a9 Binary files /dev/null and 
b/ipld/car/cmd/car/testdata/inputs/simple-unixfs.car differ diff --git a/ipld/car/cmd/car/testdata/inputs/wikipedia-cryptographic-hash-function.car b/ipld/car/cmd/car/testdata/inputs/wikipedia-cryptographic-hash-function.car new file mode 100644 index 0000000000..c914805e71 Binary files /dev/null and b/ipld/car/cmd/car/testdata/inputs/wikipedia-cryptographic-hash-function.car differ diff --git a/ipld/car/cmd/car/testdata/script/compile.txt b/ipld/car/cmd/car/testdata/script/compile.txt new file mode 100644 index 0000000000..4607ca6c66 --- /dev/null +++ b/ipld/car/cmd/car/testdata/script/compile.txt @@ -0,0 +1,28 @@ +# debug a car to patch +car debug -o out.patch ${INPUTS}/sample-v1.car +! stderr . +grep -count=1049 \+\+\+ out.patch + +# recompile to binary +car compile -o out.car out.patch +! stderr . + +# should have same blocks as it started with. +car ls out.car +stdout -count=1043 '^bafy' +stdout -count=6 '^bafk' + +# make a small car +car create --file=small.car foo.txt + +car debug -o small.patch small.car +! stderr . + +car compile -o new.car small.patch +! stderr . + +# confirm roundtrip is stable. +cmp small.car new.car + +-- foo.txt -- +hello world \ No newline at end of file diff --git a/ipld/car/cmd/car/testdata/script/create-extract.txt b/ipld/car/cmd/car/testdata/script/create-extract.txt new file mode 100644 index 0000000000..6dac510b23 --- /dev/null +++ b/ipld/car/cmd/car/testdata/script/create-extract.txt @@ -0,0 +1,12 @@ +car create --file=out.car foo.txt bar.txt +mkdir out +car extract -v -f out.car out +stdout -count=2 'txt$' +stderr -count=1 '^extracted 2 file\(s\)$' +car create --file=out2.car out/foo.txt out/bar.txt +cmp out.car out2.car + +-- foo.txt -- +foo content +-- bar.txt -- +bar content diff --git a/ipld/car/cmd/car/testdata/script/create.txt b/ipld/car/cmd/car/testdata/script/create.txt new file mode 100644 index 0000000000..13849e31aa --- /dev/null +++ b/ipld/car/cmd/car/testdata/script/create.txt @@ -0,0 +1,13 @@ +car create --file=out.car foo.txt bar.txt + +car verify out.car +car list --unixfs out.car +stdout -count=2 'txt$' +car list out.car +stdout -count=3 '^baf' +stdout -count=2 '^bafk' + +-- foo.txt -- +foo content +-- bar.txt -- +bar content diff --git a/ipld/car/cmd/car/testdata/script/extract.txt b/ipld/car/cmd/car/testdata/script/extract.txt new file mode 100644 index 0000000000..aa2713beb5 --- /dev/null +++ b/ipld/car/cmd/car/testdata/script/extract.txt @@ -0,0 +1,113 @@ +# full DAG export, everything in the CAR +mkdir actual-full +car extract -f ${INPUTS}/simple-unixfs.car actual-full +stderr '^extracted 9 file\(s\)$' +cmp actual-full/a/1/A.txt expected/a/1/A.txt +cmp actual-full/a/2/B.txt expected/a/2/B.txt +cmp actual-full/a/3/C.txt expected/a/3/C.txt +cmp actual-full/b/5/E.txt expected/b/5/E.txt +cmp actual-full/b/6/F.txt expected/b/6/F.txt +cmp actual-full/b/4/D.txt expected/b/4/D.txt +cmp actual-full/c/9/I.txt expected/c/9/I.txt +cmp actual-full/c/7/G.txt expected/c/7/G.txt +cmp actual-full/c/8/H.txt expected/c/8/H.txt + +# full DAG export, everything in the CAR, accepted from stdin +mkdir actual-stdin +stdin ${INPUTS}/simple-unixfs.car +car extract actual-stdin +stderr '^extracted 9 file\(s\)$' +cmp actual-stdin/a/1/A.txt expected/a/1/A.txt +cmp actual-stdin/a/2/B.txt expected/a/2/B.txt +cmp actual-stdin/a/3/C.txt expected/a/3/C.txt +cmp actual-stdin/b/5/E.txt expected/b/5/E.txt +cmp actual-stdin/b/6/F.txt expected/b/6/F.txt +cmp actual-stdin/b/4/D.txt expected/b/4/D.txt +cmp actual-stdin/c/9/I.txt expected/c/9/I.txt +cmp 
actual-stdin/c/7/G.txt expected/c/7/G.txt +cmp actual-stdin/c/8/H.txt expected/c/8/H.txt + +# full DAG export, everything in the CAR, but the CAR is missing blocks (incomplete DAG) +mkdir actual-missing +car extract -f ${INPUTS}/simple-unixfs-missing-blocks.car actual-missing +stderr -count=1 'data for entry not found: /b/4 \(skipping\.\.\.\)' +stderr -count=1 'data for entry not found: /b/5/E.txt \(skipping\.\.\.\)' +stderr -count=1 'data for entry not found: /b/6 \(skipping\.\.\.\)' +stderr -count=1 '^extracted 6 file\(s\)$' +cmp actual-missing/a/1/A.txt expected/a/1/A.txt +cmp actual-missing/a/2/B.txt expected/a/2/B.txt +cmp actual-missing/a/3/C.txt expected/a/3/C.txt +! exists actual-missing/b/5/E.txt +! exists actual-missing/b/6/F.txt +! exists actual-missing/b/4/D.txt +cmp actual-missing/c/9/I.txt expected/c/9/I.txt +cmp actual-missing/c/7/G.txt expected/c/7/G.txt +cmp actual-missing/c/8/H.txt expected/c/8/H.txt + +# path-based partial export, everything under the path specified (also without leading / in path) +mkdir actual-partial +car extract -f ${INPUTS}/simple-unixfs.car -p b actual-partial +stderr '^extracted 3 file\(s\)$' +! exists actual-partial/a/1/A.txt +! exists actual-partial/a/2/B.txt +! exists actual-partial/a/3/C.txt +cmp actual-partial/b/5/E.txt expected/b/5/E.txt +cmp actual-partial/b/6/F.txt expected/b/6/F.txt +cmp actual-partial/b/4/D.txt expected/b/4/D.txt +! exists actual-partial/c/9/I.txt +! exists actual-partial/c/7/G.txt +! exists actual-partial/c/8/H.txt + +# path-based single-file export (also with leading /) +mkdir actual-single +car extract -f ${INPUTS}/simple-unixfs.car -p /a/2/B.txt actual-single +stderr '^extracted 1 file\(s\)$' +! exists actual-single/a/1/A.txt +cmp actual-single/a/2/B.txt expected/a/2/B.txt +! exists actual-single/a/3/C.txt +! exists actual-single/b/5/E.txt +! exists actual-single/b/6/F.txt +! exists actual-single/b/4/D.txt +! exists actual-single/c/9/I.txt +! exists actual-single/c/7/G.txt +! exists actual-single/c/8/H.txt + +# extract that doesn't yield any files should error +! 
car extract -f ${INPUTS}/simple-unixfs-missing-blocks.car -p b +stderr '^no files extracted$' + +# car with only one file, nested inside sharded directory, output to stdout +car extract -f ${INPUTS}/wikipedia-cryptographic-hash-function.car -p wiki/Cryptographic_hash_function - +stderr '^extracted 1 file\(s\)$' +stdout -count=1 '^ Cryptographic hash function$' + +# car with only one file, full extract, lots of errors +mkdir actual-wiki +car extract -f ${INPUTS}/wikipedia-cryptographic-hash-function.car actual-wiki +stderr '^extracted 1 file\(s\)$' +stderr -count=1 '^data for entry not found for 570 unknown sharded entries \(skipped\.\.\.\)$' +# random sampling of expected skip errors +stderr -count=1 '^data for entry not found: /wiki/1969_Men''s_World_Ice_Hockey_Championships \(skipping\.\.\.\)$' +stderr -count=1 '^data for entry not found: /wiki/Wrestle_mania_30 \(skipping\.\.\.\)$' +stderr -count=1 '^data for entry not found: /zimdump_version \(skipping\.\.\.\)$' +stderr -count=1 '^data for entry not found: /favicon.ico \(skipping\.\.\.\)$' +stderr -count=1 '^data for entry not found: /index.html \(skipping\.\.\.\)$' + +-- expected/a/1/A.txt -- +a1A +-- expected/a/2/B.txt -- +a2B +-- expected/a/3/C.txt -- +a3C +-- expected/b/5/E.txt -- +b5E +-- expected/b/6/F.txt -- +b6F +-- expected/b/4/D.txt -- +b4D +-- expected/c/9/I.txt -- +c9I +-- expected/c/7/G.txt -- +c7G +-- expected/c/8/H.txt -- +c8H \ No newline at end of file diff --git a/ipld/car/cmd/car/testdata/script/filter.txt b/ipld/car/cmd/car/testdata/script/filter.txt new file mode 100644 index 0000000000..0c0b12de79 --- /dev/null +++ b/ipld/car/cmd/car/testdata/script/filter.txt @@ -0,0 +1,30 @@ +# basic filter +stdin filteredcids.txt +car filter ${INPUTS}/sample-wrapped-v2.car out.car +stderr 'warning: no roots defined after filtering' +car list out.car +! stderr . +cmp stdout filteredcids.txt + +# filter with root CID +stdin filteredroot.txt +car filter ${INPUTS}/sample-wrapped-v2.car out.car +! stderr . +car list out.car +! stderr . +cmp stdout filteredroot.txt + +# append other cids +stdin filteredcids.txt +car filter -append ${INPUTS}/sample-wrapped-v2.car out.car +! stderr . +car list out.car +stdout -count=4 '^bafy' + + +-- filteredcids.txt -- +bafy2bzacebohz654namrgmwjjx4qmtwgxixsd7pn4tlanyrc3g3hwj75hlxrw +bafy2bzaceaqtiesyfqd2jibmofz22oolguzf5wscwh73rmeypglfu2xhkptri +bafy2bzacebct3dm7izgyauijzkaf3yd7ylni725k66rq7dfp3jr5ywhpprj3k +-- filteredroot.txt -- +bafy2bzaced4ueelaegfs5fqu4tzsh6ywbbpfk3cxppupmxfdhbpbhzawfw5oy diff --git a/ipld/car/cmd/car/testdata/script/get-block.txt b/ipld/car/cmd/car/testdata/script/get-block.txt new file mode 100644 index 0000000000..e042ee645a --- /dev/null +++ b/ipld/car/cmd/car/testdata/script/get-block.txt @@ -0,0 +1,19 @@ +env SAMPLE_CID='bafy2bzacebohz654namrgmwjjx4qmtwgxixsd7pn4tlanyrc3g3hwj75hlxrw' +env MISSING_CID='bafy2bzacebohz654namrgmwjjx4qmtwgxixsd7pn4tlanyrc3g3hwj75xxxxx' + +# "get-block" on a CARv1 with an output file. +car get-block ${INPUTS}/sample-v1.car ${SAMPLE_CID} out.block +cmp out.block ${INPUTS}/${SAMPLE_CID}.block +rm out.block + +# "get-block" on a CARv1 with stdout. +car get-block ${INPUTS}/sample-v1.car ${SAMPLE_CID} +cmp stdout ${INPUTS}/${SAMPLE_CID}.block + +# Short "gb" alias. +car get-block ${INPUTS}/sample-v1.car ${SAMPLE_CID} +cmp stdout ${INPUTS}/${SAMPLE_CID}.block + +# "get-block" on a missing CID. +! 
car get-block ${INPUTS}/sample-v1.car ${MISSING_CID} +stderr 'ipld: could not find bafy2bzacebohz654namrgmwjjx4qmtwgxixsd7pn4tlanyrc3g3hwj75xxxxw' diff --git a/ipld/car/cmd/car/testdata/script/get-dag.txt b/ipld/car/cmd/car/testdata/script/get-dag.txt new file mode 100644 index 0000000000..85751a08f3 --- /dev/null +++ b/ipld/car/cmd/car/testdata/script/get-dag.txt @@ -0,0 +1,6 @@ +env SAMPLE_CID='bafy2bzaceaycv7jhaegckatnncu5yugzkrnzeqsppzegufr35lroxxnsnpspu' +car get-dag ${INPUTS}/sample-v1.car ${SAMPLE_CID} out.car +! stderr . +car list out.car +! stderr . +stdout -count=1 '^bafy2bzaceaycv7jhaegckatnncu5yugzkrnzeqsppzegufr35lroxxnsnpspu' \ No newline at end of file diff --git a/ipld/car/cmd/car/testdata/script/index-create.txt b/ipld/car/cmd/car/testdata/script/index-create.txt new file mode 100644 index 0000000000..bfdfe65c1d --- /dev/null +++ b/ipld/car/cmd/car/testdata/script/index-create.txt @@ -0,0 +1,3 @@ +car index create ${INPUTS}/sample-v1.car sample-v1.car.idx +car detach-index ${INPUTS}/sample-wrapped-v2.car sample-wrapped-v2.car.idx +cmp sample-v1.car.idx sample-wrapped-v2.car.idx diff --git a/ipld/car/cmd/car/testdata/script/inspect.txt b/ipld/car/cmd/car/testdata/script/inspect.txt new file mode 100644 index 0000000000..efa20731ad --- /dev/null +++ b/ipld/car/cmd/car/testdata/script/inspect.txt @@ -0,0 +1,43 @@ +car inspect ${INPUTS}/sample-v1.car +cmp stdout v1inspect.txt + +car inspect ${INPUTS}/sample-wrapped-v2.car +cmp stdout v2inspect.txt + +! car inspect ${INPUTS}/badheaderlength.car +stderr 'invalid header data' + +! car inspect ${INPUTS}/badsectionlength.car +stderr 'invalid section data' + +-- v1inspect.txt -- +Version: 1 +Roots: bafy2bzaced4ueelaegfs5fqu4tzsh6ywbbpfk3cxppupmxfdhbpbhzawfw5oy +Root blocks present in data: Yes +Block count: 1049 +Min / average / max block length (bytes): 1 / 417 / 1342 +Min / average / max CID length (bytes): 14 / 37 / 38 +Block count per codec: + raw: 6 + dag-cbor: 1043 +CID count per multihash: + identity: 6 + blake2b-256: 1043 +-- v2inspect.txt -- +Version: 2 +Characteristics: 00000000000000000000000000000000 +Data offset: 51 +Data (payload) length: 479907 +Index offset: 479958 +Index type: car-multihash-index-sorted +Roots: bafy2bzaced4ueelaegfs5fqu4tzsh6ywbbpfk3cxppupmxfdhbpbhzawfw5oy +Root blocks present in data: Yes +Block count: 1049 +Min / average / max block length (bytes): 1 / 417 / 1342 +Min / average / max CID length (bytes): 14 / 37 / 38 +Block count per codec: + raw: 6 + dag-cbor: 1043 +CID count per multihash: + identity: 6 + blake2b-256: 1043 diff --git a/ipld/car/cmd/car/testdata/script/list.txt b/ipld/car/cmd/car/testdata/script/list.txt new file mode 100644 index 0000000000..29bb9021b7 --- /dev/null +++ b/ipld/car/cmd/car/testdata/script/list.txt @@ -0,0 +1,16 @@ +env SAMPLE_CID='bafy2bzacebohz654namrgmwjjx4qmtwgxixsd7pn4tlanyrc3g3hwj75hlxrw' +# "list" on a CARv1. +car list ${INPUTS}/sample-v1.car +stdout -count=1043 '^bafy' +stdout -count=6 '^bafk' +stdout -count=1 'bafy2bzacebohz654namrgmwjjx4qmtwgxixsd7pn4tlanyrc3g3hwj75hlxrw' + +# "list" on a CARv2. +car list ${INPUTS}/sample-wrapped-v2.car +stdout -count=1043 '^bafy' +stdout -count=6 '^bafk' +stdout -count=1 'bafy2bzacebohz654namrgmwjjx4qmtwgxixsd7pn4tlanyrc3g3hwj75hlxrw' + +# Short "l" alias. 
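+# Only the count is asserted here; "l" is expected to behave identically to "list".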
+car l ${INPUTS}/sample-v1.car
+stdout -count=1043 '^bafy'
diff --git a/ipld/car/cmd/car/testdata/script/root.txt b/ipld/car/cmd/car/testdata/script/root.txt
new file mode 100644
index 0000000000..834f3a69d4
--- /dev/null
+++ b/ipld/car/cmd/car/testdata/script/root.txt
@@ -0,0 +1,15 @@
+car root ${INPUTS}/sample-v1.car
+cmp stdout v1root.txt
+
+car root ${INPUTS}/sample-wrapped-v2.car
+cmp stdout v2root.txt
+
+stop stdin_test_needs_car_fix
+stdin ${INPUTS}/sample-wrapped-v2.car
+car root
+cmp stdout v2root.txt
+
+-- v1root.txt --
+bafy2bzaced4ueelaegfs5fqu4tzsh6ywbbpfk3cxppupmxfdhbpbhzawfw5oy
+-- v2root.txt --
+bafy2bzaced4ueelaegfs5fqu4tzsh6ywbbpfk3cxppupmxfdhbpbhzawfw5oy
\ No newline at end of file
diff --git a/ipld/car/cmd/car/testdata/script/verify.txt b/ipld/car/cmd/car/testdata/script/verify.txt
new file mode 100644
index 0000000000..3ef3c49e4c
--- /dev/null
+++ b/ipld/car/cmd/car/testdata/script/verify.txt
@@ -0,0 +1,3 @@
+# "verify" should exit with code 0 on reasonable cars.
+car verify ${INPUTS}/sample-v1.car
+car verify ${INPUTS}/sample-wrapped-v2.car
diff --git a/ipld/car/cmd/car/verify.go b/ipld/car/cmd/car/verify.go
new file mode 100644
index 0000000000..faee3aa36d
--- /dev/null
+++ b/ipld/car/cmd/car/verify.go
@@ -0,0 +1,114 @@
+package main
+
+import (
+	"fmt"
+	"io"
+	"os"
+
+	"github.com/ipfs/go-cid"
+	carv2 "github.com/ipld/go-car/v2"
+	"github.com/ipld/go-car/v2/index"
+	"github.com/multiformats/go-multihash"
+	"github.com/urfave/cli/v2"
+)
+
+// VerifyCar is a command to check a file's validity
+func VerifyCar(c *cli.Context) error {
+	if c.Args().Len() == 0 {
+		return fmt.Errorf("usage: car verify <file.car>")
+	}
+
+	// header
+	rx, err := carv2.OpenReader(c.Args().First())
+	if err != nil {
+		return err
+	}
+	defer rx.Close()
+	roots, err := rx.Roots()
+	if err != nil {
+		return err
+	}
+	if len(roots) == 0 {
+		return fmt.Errorf("no roots listed in car header")
+	}
+	rootMap := make(map[cid.Cid]struct{})
+	for _, r := range roots {
+		rootMap[r] = struct{}{}
+	}
+
+	if rx.Version == 2 {
+		if rx.Header.DataSize == 0 {
+			return fmt.Errorf("size of wrapped v1 car listed as '0'")
+		}
+
+		flen, err := os.Stat(c.Args().First())
+		if err != nil {
+			return err
+		}
+		lengthToIndex := carv2.PragmaSize + carv2.HeaderSize + rx.Header.DataSize
+		if uint64(flen.Size()) > lengthToIndex && rx.Header.IndexOffset == 0 {
+			return fmt.Errorf("header claims no index, but extra bytes in file beyond data size")
+		}
+		if rx.Header.DataOffset < carv2.PragmaSize+carv2.HeaderSize {
+			return fmt.Errorf("data offset places data within carv2 header")
+		}
+		if rx.Header.IndexOffset < lengthToIndex {
+			return fmt.Errorf("index offset overlaps with data. data ends at %d. 
index offset of %d", lengthToIndex, rx.Header.IndexOffset) + } + } + + // blocks + fd, err := os.Open(c.Args().First()) + if err != nil { + return err + } + rd, err := carv2.NewBlockReader(fd) + if err != nil { + return err + } + + cidList := make([]cid.Cid, 0) + for { + blk, err := rd.Next() + if err == io.EOF { + break + } + if err != nil { + return err + } + delete(rootMap, blk.Cid()) + cidList = append(cidList, blk.Cid()) + } + + if len(rootMap) > 0 { + return fmt.Errorf("header lists root(s) not present as a block: %v", rootMap) + } + + // index + if rx.Version == 2 && rx.Header.HasIndex() { + ir, err := rx.IndexReader() + if err != nil { + return err + } + idx, err := index.ReadFrom(ir) + if err != nil { + return err + } + for _, c := range cidList { + cidHash, err := multihash.Decode(c.Hash()) + if err != nil { + return err + } + if cidHash.Code == multihash.IDENTITY { + continue + } + if err := idx.GetAll(c, func(_ uint64) bool { + return true + }); err != nil { + return fmt.Errorf("could not look up known cid %s in index: %w", c, err) + } + } + } + + return nil +} diff --git a/ipld/car/fuzz_test.go b/ipld/car/fuzz_test.go new file mode 100644 index 0000000000..cf68d946eb --- /dev/null +++ b/ipld/car/fuzz_test.go @@ -0,0 +1,34 @@ +//go:build go1.18 + +package car_test + +import ( + "bytes" + "encoding/hex" + "io" + "testing" + + car "github.com/ipld/go-car" +) + +func FuzzCarReader(f *testing.F) { + fixture, err := hex.DecodeString(fixtureStr) + if err != nil { + f.Fatal(err) + } + f.Add(fixture) + + f.Fuzz(func(t *testing.T, data []byte) { + r, err := car.NewCarReader(bytes.NewReader(data)) + if err != nil { + return + } + + for { + _, err = r.Next() + if err == io.EOF { + return + } + } + }) +} diff --git a/ipld/car/options.go b/ipld/car/options.go new file mode 100644 index 0000000000..e317f9cc9e --- /dev/null +++ b/ipld/car/options.go @@ -0,0 +1,51 @@ +package car + +import "math" + +// options holds the configured options after applying a number of +// Option funcs. +type options struct { + TraverseLinksOnlyOnce bool + MaxTraversalLinks uint64 +} + +// Option describes an option which affects behavior when +// interacting with the interface. +type Option func(*options) + +// TraverseLinksOnlyOnce prevents the traversal engine from repeatedly visiting +// the same links more than once. +// +// This can be an efficient strategy for an exhaustive selector where it's known +// that repeat visits won't impact the completeness of execution. However it +// should be used with caution with most other selectors as repeat visits of +// links for different reasons during selector execution can be valid and +// necessary to perform full traversal. +func TraverseLinksOnlyOnce() Option { + return func(sco *options) { + sco.TraverseLinksOnlyOnce = true + } +} + +// MaxTraversalLinks changes the allowed number of links a selector traversal +// can execute before failing. +// +// Note that setting this option may cause an error to be returned from selector +// execution when building a SelectiveCar. +func MaxTraversalLinks(MaxTraversalLinks uint64) Option { + return func(sco *options) { + sco.MaxTraversalLinks = MaxTraversalLinks + } +} + +// applyOptions applies given opts and returns the resulting options. 
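+// Options are applied in order, so a later option wins when two touch the same
+// field. A sketch of typical usage (applyOptions is called for you by
+// NewSelectiveCar and friends):
+//
+//	opts := applyOptions(TraverseLinksOnlyOnce(), MaxTraversalLinks(4096))
+//	// opts.TraverseLinksOnlyOnce == true, opts.MaxTraversalLinks == 4096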
+func applyOptions(opt ...Option) options {
+	opts := options{
+		TraverseLinksOnlyOnce: false,         // default: recurse until exhausted
+		MaxTraversalLinks:     math.MaxInt64, // default: traverse all
+	}
+	for _, o := range opt {
+		o(&opts)
+	}
+	return opts
+}
diff --git a/ipld/car/options_test.go b/ipld/car/options_test.go
new file mode 100644
index 0000000000..250c672037
--- /dev/null
+++ b/ipld/car/options_test.go
@@ -0,0 +1,27 @@
+package car
+
+import (
+	"math"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+func TestApplyOptions_SetsExpectedDefaults(t *testing.T) {
+	require.Equal(t, options{
+		MaxTraversalLinks:     math.MaxInt64,
+		TraverseLinksOnlyOnce: false,
+	}, applyOptions())
+}
+
+func TestApplyOptions_AppliesOptions(t *testing.T) {
+	require.Equal(t,
+		options{
+			MaxTraversalLinks:     123,
+			TraverseLinksOnlyOnce: true,
+		},
+		applyOptions(
+			MaxTraversalLinks(123),
+			TraverseLinksOnlyOnce(),
+		))
+}
diff --git a/ipld/car/selectivecar.go b/ipld/car/selectivecar.go
new file mode 100644
index 0000000000..14c5b05aba
--- /dev/null
+++ b/ipld/car/selectivecar.go
@@ -0,0 +1,289 @@
+package car
+
+import (
+	"bytes"
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"math"
+
+	cid "github.com/ipfs/go-cid"
+	util "github.com/ipld/go-car/util"
+	"github.com/ipld/go-ipld-prime"
+	cidlink "github.com/ipld/go-ipld-prime/linking/cid"
+	basicnode "github.com/ipld/go-ipld-prime/node/basic"
+	"github.com/ipld/go-ipld-prime/traversal"
+	"github.com/ipld/go-ipld-prime/traversal/selector"
+
+	// The dag-pb and raw codecs are necessary for unixfs.
+	dagpb "github.com/ipld/go-codec-dagpb"
+	_ "github.com/ipld/go-ipld-prime/codec/raw"
+)
+
+// Dag is a root/selector combo to put into a car
+type Dag struct {
+	Root     cid.Cid
+	Selector ipld.Node
+}
+
+// Block is all information and metadata about a block that is part of a car file
+type Block struct {
+	BlockCID cid.Cid
+	Data     []byte
+	Offset   uint64
+	Size     uint64
+}
+
+// SelectiveCar is a car file based on root + selector combos instead of just
+// a single root and complete dag walk
+type SelectiveCar struct {
+	ctx   context.Context
+	dags  []Dag
+	store ReadStore
+	opts  options
+}
+
+// OnCarHeaderFunc is called during traversal when the header is created
+type OnCarHeaderFunc func(CarHeader) error
+
+// OnNewCarBlockFunc is called during traversal when a new unique block is encountered
+type OnNewCarBlockFunc func(Block) error
+
+// SelectiveCarPrepared is a SelectiveCar that has already been traversed, such that it
+// can be written quicker with Dump. 
It also contains metadata already collected about
+// the Car file, like its size and the number of blocks that go into it
+type SelectiveCarPrepared struct {
+	SelectiveCar
+	size               uint64
+	header             CarHeader
+	cids               []cid.Cid
+	userOnNewCarBlocks []OnNewCarBlockFunc
+}
+
+// NewSelectiveCar creates a new SelectiveCar for the given car file, based on
+// a block store and a set of root+selector pairs
+func NewSelectiveCar(ctx context.Context, store ReadStore, dags []Dag, opts ...Option) SelectiveCar {
+	return SelectiveCar{
+		ctx:   ctx,
+		store: store,
+		dags:  dags,
+		opts:  applyOptions(opts...),
+	}
+}
+
+func (sc SelectiveCar) traverse(onCarHeader OnCarHeaderFunc, onNewCarBlock OnNewCarBlockFunc) (uint64, error) {
+	traverser := &selectiveCarTraverser{onCarHeader, onNewCarBlock, 0, cid.NewSet(), sc, cidlink.DefaultLinkSystem()}
+	traverser.lsys.StorageReadOpener = traverser.loader
+	return traverser.traverse()
+}
+
+// Prepare traverses a car file and collects data on what is about to be written,
+// but does not actually write the file
+func (sc SelectiveCar) Prepare(userOnNewCarBlocks ...OnNewCarBlockFunc) (SelectiveCarPrepared, error) {
+	var header CarHeader
+	var cids []cid.Cid
+
+	onCarHeader := func(h CarHeader) error {
+		header = h
+		return nil
+	}
+	onNewCarBlock := func(block Block) error {
+		cids = append(cids, block.BlockCID)
+		return nil
+	}
+	size, err := sc.traverse(onCarHeader, onNewCarBlock)
+	if err != nil {
+		return SelectiveCarPrepared{}, err
+	}
+	return SelectiveCarPrepared{sc, size, header, cids, userOnNewCarBlocks}, nil
+}
+
+func (sc SelectiveCar) Write(w io.Writer, userOnNewCarBlocks ...OnNewCarBlockFunc) error {
+	onCarHeader := func(h CarHeader) error {
+		if err := WriteHeader(&h, w); err != nil {
+			return fmt.Errorf("failed to write car header: %s", err)
+		}
+		return nil
+	}
+	onNewCarBlock := func(block Block) error {
+		err := util.LdWrite(w, block.BlockCID.Bytes(), block.Data)
+		if err != nil {
+			return err
+		}
+		for _, userOnNewCarBlock := range userOnNewCarBlocks {
+			err := userOnNewCarBlock(block)
+			if err != nil {
+				return err
+			}
+		}
+		return nil
+	}
+	_, err := sc.traverse(onCarHeader, onNewCarBlock)
+	return err
+}
+
+// Size returns the total size in bytes of the car file that will be written
+func (sc SelectiveCarPrepared) Size() uint64 {
+	return sc.size
+}
+
+// Header returns the header for the car file that will be written
+func (sc SelectiveCarPrepared) Header() CarHeader {
+	return sc.header
+}
+
+// Cids returns the list of unique block cids that will be written to the car file
+func (sc SelectiveCarPrepared) Cids() []cid.Cid {
+	return sc.cids
+}
+
+// Dump writes the car file as quickly as possible based on information already
+// collected
+func (sc SelectiveCarPrepared) Dump(ctx context.Context, w io.Writer) error {
+	offset, err := HeaderSize(&sc.header)
+	if err != nil {
+		return fmt.Errorf("failed to size car header: %s", err)
+	}
+	if err := WriteHeader(&sc.header, w); err != nil {
+		return fmt.Errorf("failed to write car header: %s", err)
+	}
+	for _, c := range sc.cids {
+		blk, err := sc.store.Get(ctx, c)
+		if err != nil {
+			return err
+		}
+		raw := blk.RawData()
+		size := util.LdSize(c.Bytes(), raw)
+		err = util.LdWrite(w, c.Bytes(), raw)
+		if err != nil {
+			return err
+		}
+		for _, userOnNewCarBlock := range sc.userOnNewCarBlocks {
+			err := userOnNewCarBlock(Block{
+				BlockCID: c,
+				Data:     raw,
+				Offset:   offset,
+				Size:     size,
+			})
+			if err != nil {
+				return err
+			}
+		}
+		offset += size
+	}
+	return nil
+}
+
+type selectiveCarTraverser struct {
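+	// onCarHeader fires once with the assembled header; onNewCarBlock fires once
+	// per unique block. offset tracks where the next section will land in the
+	// CARv1 stream, and cidSet dedupes blocks reached via repeated links.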
onCarHeader OnCarHeaderFunc + onNewCarBlock OnNewCarBlockFunc + offset uint64 + cidSet *cid.Set + sc SelectiveCar + lsys ipld.LinkSystem +} + +func (sct *selectiveCarTraverser) traverse() (uint64, error) { + err := sct.traverseHeader() + if err != nil { + return 0, err + } + err = sct.traverseBlocks() + if err != nil { + return 0, err + } + return sct.offset, nil +} + +func (sct *selectiveCarTraverser) traverseHeader() error { + roots := make([]cid.Cid, 0, len(sct.sc.dags)) + for _, carDag := range sct.sc.dags { + roots = append(roots, carDag.Root) + } + + header := CarHeader{ + Roots: roots, + Version: 1, + } + + size, err := HeaderSize(&header) + if err != nil { + return err + } + + sct.offset += size + + return sct.onCarHeader(header) +} + +func (sct *selectiveCarTraverser) loader(ctx ipld.LinkContext, lnk ipld.Link) (io.Reader, error) { + cl, ok := lnk.(cidlink.Link) + if !ok { + return nil, errors.New("incorrect link type") + } + c := cl.Cid + blk, err := sct.sc.store.Get(ctx.Ctx, c) + if err != nil { + return nil, err + } + raw := blk.RawData() + if !sct.cidSet.Has(c) { + sct.cidSet.Add(c) + size := util.LdSize(c.Bytes(), raw) + err := sct.onNewCarBlock(Block{ + BlockCID: c, + Data: raw, + Offset: sct.offset, + Size: size, + }) + if err != nil { + return nil, err + } + sct.offset += size + } + return bytes.NewReader(raw), nil +} + +func (sct *selectiveCarTraverser) traverseBlocks() error { + nsc := func(lnk ipld.Link, lctx ipld.LinkContext) (ipld.NodePrototype, error) { + // We can decode all nodes into basicnode's Any, except for + // dagpb nodes, which must explicitly use the PBNode prototype. + if lnk, ok := lnk.(cidlink.Link); ok && lnk.Cid.Prefix().Codec == 0x70 { + return dagpb.Type.PBNode, nil + } + return basicnode.Prototype.Any, nil + } + + for _, carDag := range sct.sc.dags { + parsed, err := selector.ParseSelector(carDag.Selector) + if err != nil { + return err + } + lnk := cidlink.Link{Cid: carDag.Root} + ns, _ := nsc(lnk, ipld.LinkContext{}) // nsc won't error + nd, err := sct.lsys.Load(ipld.LinkContext{Ctx: sct.sc.ctx}, lnk, ns) + if err != nil { + return err + } + prog := traversal.Progress{ + Cfg: &traversal.Config{ + Ctx: sct.sc.ctx, + LinkSystem: sct.lsys, + LinkTargetNodePrototypeChooser: nsc, + LinkVisitOnlyOnce: sct.sc.opts.TraverseLinksOnlyOnce, + }, + } + if sct.sc.opts.MaxTraversalLinks < math.MaxInt64 { + prog.Budget = &traversal.Budget{ + NodeBudget: math.MaxInt64, + LinkBudget: int64(sct.sc.opts.MaxTraversalLinks), + } + } + err = prog.WalkAdv(nd, parsed, func(traversal.Progress, ipld.Node, traversal.VisitReason) error { return nil }) + if err != nil { + return err + } + } + return nil +} diff --git a/ipld/car/selectivecar_test.go b/ipld/car/selectivecar_test.go new file mode 100644 index 0000000000..33c2ce705d --- /dev/null +++ b/ipld/car/selectivecar_test.go @@ -0,0 +1,228 @@ +package car_test + +import ( + "bytes" + "context" + "testing" + + cid "github.com/ipfs/go-cid" + format "github.com/ipfs/go-ipld-format" + blocks "github.com/ipfs/go-libipfs/blocks" + "github.com/ipfs/go-merkledag" + dstest "github.com/ipfs/go-merkledag/test" + car "github.com/ipld/go-car" + basicnode "github.com/ipld/go-ipld-prime/node/basic" + "github.com/ipld/go-ipld-prime/traversal/selector" + "github.com/ipld/go-ipld-prime/traversal/selector/builder" + selectorparse "github.com/ipld/go-ipld-prime/traversal/selector/parse" + "github.com/stretchr/testify/require" +) + +func TestRoundtripSelective(t *testing.T) { + ctx := context.Background() + sourceBserv := dstest.Bserv() + 
sourceBs := sourceBserv.Blockstore()
+	dserv := merkledag.NewDAGService(sourceBserv)
+	a := merkledag.NewRawNode([]byte("aaaa"))
+	b := merkledag.NewRawNode([]byte("bbbb"))
+	c := merkledag.NewRawNode([]byte("cccc"))
+
+	nd1 := &merkledag.ProtoNode{}
+	nd1.AddNodeLink("cat", a)
+
+	nd2 := &merkledag.ProtoNode{}
+	nd2.AddNodeLink("first", nd1)
+	nd2.AddNodeLink("dog", b)
+	nd2.AddNodeLink("repeat", nd1)
+
+	nd3 := &merkledag.ProtoNode{}
+	nd3.AddNodeLink("second", nd2)
+	nd3.AddNodeLink("bear", c)
+
+	assertAddNodes(t, dserv, a, b, c, nd1, nd2, nd3)
+
+	ssb := builder.NewSelectorSpecBuilder(basicnode.Prototype.Any)
+
+	// the graph assembled above looks as follows, in order:
+	// nd3 -> [c, nd2 -> [nd1 -> a, b, nd1 -> a]]
+	// this selector starts at nd3, and traverses a link at index 1 (nd2, the second link, zero indexed)
+	// it then recursively traverses all of its children
+	// the only node skipped is 'c' -- link at index 0 immediately below nd3
+	// the purpose is simply to show we are not writing the entire merkledag underneath
+	// nd3
+	selector := ssb.ExploreFields(func(efsb builder.ExploreFieldsSpecBuilder) {
+		efsb.Insert("Links",
+			ssb.ExploreIndex(1, ssb.ExploreRecursive(selector.RecursionLimitNone(), ssb.ExploreAll(ssb.ExploreRecursiveEdge()))))
+	}).Node()
+
+	sc := car.NewSelectiveCar(context.Background(), sourceBs, []car.Dag{{Root: nd3.Cid(), Selector: selector}})
+
+	// write car in one step
+	buf := new(bytes.Buffer)
+	blockCount := 0
+	var oneStepBlocks []car.Block
+	err := sc.Write(buf, func(block car.Block) error {
+		oneStepBlocks = append(oneStepBlocks, block)
+		blockCount++
+		return nil
+	})
+	require.Equal(t, blockCount, 5)
+	require.NoError(t, err)
+
+	// create a new builder for two-step write
+	sc2 := car.NewSelectiveCar(context.Background(), sourceBs, []car.Dag{{Root: nd3.Cid(), Selector: selector}})
+
+	// write car in two steps
+	var twoStepBlocks []car.Block
+	scp, err := sc2.Prepare(func(block car.Block) error {
+		twoStepBlocks = append(twoStepBlocks, block)
+		return nil
+	})
+	require.NoError(t, err)
+	buf2 := new(bytes.Buffer)
+	err = scp.Dump(ctx, buf2)
+	require.NoError(t, err)
+
+	// verify the preparation step correctly assessed length and blocks
+	require.Equal(t, scp.Size(), uint64(buf.Len()))
+	require.Equal(t, len(scp.Cids()), blockCount)
+
+	// verify equal data written by both methods
+	require.Equal(t, buf.Bytes(), buf2.Bytes())
+
+	// verify equal blocks were passed to user block hook funcs
+	require.Equal(t, oneStepBlocks, twoStepBlocks)
+
+	// read out the car and verify contents
+	bserv := dstest.Bserv()
+	ch, err := car.LoadCar(ctx, bserv.Blockstore(), buf)
+	require.NoError(t, err)
+	require.Equal(t, len(ch.Roots), 1)
+
+	require.True(t, ch.Roots[0].Equals(nd3.Cid()))
+
+	bs := bserv.Blockstore()
+	for _, nd := range []format.Node{a, b, nd1, nd2, nd3} {
+		has, err := bs.Has(ctx, nd.Cid())
+		require.NoError(t, err)
+		require.True(t, has)
+	}
+
+	for _, nd := range []format.Node{c} {
+		has, err := bs.Has(ctx, nd.Cid())
+		require.NoError(t, err)
+		require.False(t, has)
+	}
+}
+
+func TestNoLinkRepeatSelective(t *testing.T) {
+	sourceBserv := dstest.Bserv()
+	sourceBs := countingReadStore{bs: sourceBserv.Blockstore()}
+	dserv := merkledag.NewDAGService(sourceBserv)
+	a := merkledag.NewRawNode([]byte("aaaa"))
+	b := merkledag.NewRawNode([]byte("bbbb"))
+	c := merkledag.NewRawNode([]byte("cccc"))
+
+	nd1 := &merkledag.ProtoNode{}
+	nd1.AddNodeLink("cat", a)
+
+	nd2 := &merkledag.ProtoNode{}
+	nd2.AddNodeLink("first", nd1)
+	nd2.AddNodeLink("dog", b)
+	
nd2.AddNodeLink("repeat", nd1) + + nd3 := &merkledag.ProtoNode{} + nd3.AddNodeLink("second", nd2) + nd3.AddNodeLink("bear", c) + nd3.AddNodeLink("bearagain1", c) + nd3.AddNodeLink("bearagain2", c) + nd3.AddNodeLink("bearagain3", c) + + assertAddNodes(t, dserv, a, b, c, nd1, nd2, nd3) + + t.Run("TraverseLinksOnlyOnce off", func(t *testing.T) { + sourceBs.count = 0 + sc := car.NewSelectiveCar(context.Background(), + &sourceBs, + []car.Dag{{Root: nd3.Cid(), Selector: selectorparse.CommonSelector_ExploreAllRecursively}}, + ) + + buf := new(bytes.Buffer) + blockCount := 0 + err := sc.Write(buf, func(block car.Block) error { + blockCount++ + return nil + }) + require.Equal(t, blockCount, 6) + require.Equal(t, sourceBs.count, 11) // with TraverseLinksOnlyOnce off, we expect repeat block visits because our DAG has repeat links + require.NoError(t, err) + }) + + t.Run("TraverseLinksOnlyOnce on", func(t *testing.T) { + sourceBs.count = 0 + + sc := car.NewSelectiveCar(context.Background(), + &sourceBs, + []car.Dag{{Root: nd3.Cid(), Selector: selectorparse.CommonSelector_ExploreAllRecursively}}, + car.TraverseLinksOnlyOnce(), + ) + + buf := new(bytes.Buffer) + blockCount := 0 + err := sc.Write(buf, func(block car.Block) error { + blockCount++ + return nil + }) + require.Equal(t, blockCount, 6) + require.Equal(t, sourceBs.count, 6) // only 6 blocks to load, no duplicate loading expected + require.NoError(t, err) + }) +} + +func TestLinkLimitSelective(t *testing.T) { + sourceBserv := dstest.Bserv() + sourceBs := sourceBserv.Blockstore() + dserv := merkledag.NewDAGService(sourceBserv) + a := merkledag.NewRawNode([]byte("aaaa")) + b := merkledag.NewRawNode([]byte("bbbb")) + c := merkledag.NewRawNode([]byte("cccc")) + + nd1 := &merkledag.ProtoNode{} + nd1.AddNodeLink("cat", a) + + nd2 := &merkledag.ProtoNode{} + nd2.AddNodeLink("first", nd1) + nd2.AddNodeLink("dog", b) + nd2.AddNodeLink("repeat", nd1) + + nd3 := &merkledag.ProtoNode{} + nd3.AddNodeLink("second", nd2) + nd3.AddNodeLink("bear", c) + + assertAddNodes(t, dserv, a, b, c, nd1, nd2, nd3) + + sc := car.NewSelectiveCar(context.Background(), + sourceBs, + []car.Dag{{Root: nd3.Cid(), Selector: selectorparse.CommonSelector_ExploreAllRecursively}}, + car.MaxTraversalLinks(2)) + + buf := new(bytes.Buffer) + blockCount := 0 + err := sc.Write(buf, func(block car.Block) error { + blockCount++ + return nil + }) + require.Equal(t, blockCount, 3) // root + 2 + require.Error(t, err) + require.Regexp(t, "^traversal budget exceeded: budget for links reached zero while on path .*", err) +} + +type countingReadStore struct { + bs car.ReadStore + count int +} + +func (rs *countingReadStore) Get(ctx context.Context, c cid.Cid) (blocks.Block, error) { + rs.count++ + return rs.bs.Get(ctx, c) +} diff --git a/ipld/car/testdata/fuzz/FuzzCarReader/21a90a70853c333c6b9ddc133bae39a14164874ed8abdee1f6a6795311a0e546 b/ipld/car/testdata/fuzz/FuzzCarReader/21a90a70853c333c6b9ddc133bae39a14164874ed8abdee1f6a6795311a0e546 new file mode 100644 index 0000000000..a7ab1d519c --- /dev/null +++ b/ipld/car/testdata/fuzz/FuzzCarReader/21a90a70853c333c6b9ddc133bae39a14164874ed8abdee1f6a6795311a0e546 @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte("\xe0\xe0\xe0\xe0\xa7\x06\folLʔ<#oK\x19g#H\x96\b\xed\xb4*\x8b\x8f\xa8\vgversion\x19") diff --git a/ipld/car/testdata/fuzz/FuzzCarReader/5857e57e4072c6b0d8684030cc13b5570849c89b3dbf5bc0152abc66c9642f3e b/ipld/car/testdata/fuzz/FuzzCarReader/5857e57e4072c6b0d8684030cc13b5570849c89b3dbf5bc0152abc66c9642f3e new file mode 100644 index 0000000000..3e680cf60e 
--- /dev/null +++ b/ipld/car/testdata/fuzz/FuzzCarReader/5857e57e4072c6b0d8684030cc13b5570849c89b3dbf5bc0152abc66c9642f3e @@ -0,0 +1,2 @@ +go test fuzz v1 +[]byte(":\xa2eroots\x81\xd80X%\x00\x0100 00000000000000000000000000000000gversion\x01\x010") diff --git a/ipld/car/util/util.go b/ipld/car/util/util.go new file mode 100644 index 0000000000..af2f38e420 --- /dev/null +++ b/ipld/car/util/util.go @@ -0,0 +1,136 @@ +package util + +import ( + "bufio" + "bytes" + "encoding/binary" + "errors" + "fmt" + "io" + + cid "github.com/ipfs/go-cid" + mh "github.com/multiformats/go-multihash" +) + +// MaxAllowedSectionSize dictates the maximum number of bytes that a CARv1 header +// or section is allowed to occupy without causing a decode to error. +// This cannot be supplied as an option, only adjusted as a global. You should +// use v2#NewReader instead since it allows for options to be passed in. +var MaxAllowedSectionSize uint = 32 << 20 // 32MiB + +var cidv0Pref = []byte{0x12, 0x20} + +type BytesReader interface { + io.Reader + io.ByteReader +} + +// Deprecated: ReadCid shouldn't be used directly, use CidFromReader from go-cid +func ReadCid(buf []byte) (cid.Cid, int, error) { + if len(buf) >= 2 && bytes.Equal(buf[:2], cidv0Pref) { + i := 34 + if len(buf) < i { + i = len(buf) + } + c, err := cid.Cast(buf[:i]) + return c, i, err + } + + br := bytes.NewReader(buf) + + // assume cidv1 + vers, err := binary.ReadUvarint(br) + if err != nil { + return cid.Cid{}, 0, err + } + + // TODO: the go-cid package allows version 0 here as well + if vers != 1 { + return cid.Cid{}, 0, fmt.Errorf("invalid cid version number") + } + + codec, err := binary.ReadUvarint(br) + if err != nil { + return cid.Cid{}, 0, err + } + + mhr := mh.NewReader(br) + h, err := mhr.ReadMultihash() + if err != nil { + return cid.Cid{}, 0, err + } + + return cid.NewCidV1(codec, h), len(buf) - br.Len(), nil +} + +func ReadNode(br *bufio.Reader) (cid.Cid, []byte, error) { + data, err := LdRead(br) + if err != nil { + return cid.Cid{}, nil, err + } + + n, c, err := cid.CidFromReader(bytes.NewReader(data)) + if err != nil { + return cid.Cid{}, nil, err + } + + return c, data[n:], nil +} + +func LdWrite(w io.Writer, d ...[]byte) error { + var sum uint64 + for _, s := range d { + sum += uint64(len(s)) + } + + buf := make([]byte, 8) + n := binary.PutUvarint(buf, sum) + _, err := w.Write(buf[:n]) + if err != nil { + return err + } + + for _, s := range d { + _, err = w.Write(s) + if err != nil { + return err + } + } + + return nil +} + +func LdSize(d ...[]byte) uint64 { + var sum uint64 + for _, s := range d { + sum += uint64(len(s)) + } + buf := make([]byte, 8) + n := binary.PutUvarint(buf, sum) + return sum + uint64(n) +} + +func LdRead(r *bufio.Reader) ([]byte, error) { + if _, err := r.Peek(1); err != nil { // no more blocks, likely clean io.EOF + return nil, err + } + + l, err := binary.ReadUvarint(r) + if err != nil { + if err == io.EOF { + return nil, io.ErrUnexpectedEOF // don't silently pretend this is a clean EOF + } + return nil, err + } + + if l > uint64(MaxAllowedSectionSize) { // Don't OOM + return nil, errors.New("malformed car; header is bigger than util.MaxAllowedSectionSize") + } + + buf := make([]byte, l) + if _, err := io.ReadFull(r, buf); err != nil { + return nil, err + } + + return buf, nil +} diff --git a/ipld/car/util/util_test.go b/ipld/car/util/util_test.go new file mode 100644 index 0000000000..3f8f59af46 --- /dev/null +++ b/ipld/car/util/util_test.go @@ -0,0 +1,27 @@ +package util_test + +import ( + "bytes" + crand 
"crypto/rand" + "math/rand" + "testing" + + "github.com/ipld/go-car/util" + "github.com/stretchr/testify/require" +) + +func TestLdSize(t *testing.T) { + for i := 0; i < 5; i++ { + var buf bytes.Buffer + data := make([][]byte, 5) + for j := 0; j < 5; j++ { + data[j] = make([]byte, rand.Intn(30)) + _, err := crand.Read(data[j]) + require.NoError(t, err) + } + size := util.LdSize(data...) + err := util.LdWrite(&buf, data...) + require.NoError(t, err) + require.Equal(t, uint64(len(buf.Bytes())), size) + } +} diff --git a/ipld/car/v2/LICENSE.md b/ipld/car/v2/LICENSE.md new file mode 100644 index 0000000000..2fa16a1537 --- /dev/null +++ b/ipld/car/v2/LICENSE.md @@ -0,0 +1,229 @@ +The contents of this repository are Copyright (c) corresponding authors and +contributors, licensed under the `Permissive License Stack` meaning either of: + +- Apache-2.0 Software License: https://www.apache.org/licenses/LICENSE-2.0 + ([...4tr2kfsq](https://dweb.link/ipfs/bafkreiankqxazcae4onkp436wag2lj3ccso4nawxqkkfckd6cg4tr2kfsq)) + +- MIT Software License: https://opensource.org/licenses/MIT + ([...vljevcba](https://dweb.link/ipfs/bafkreiepofszg4gfe2gzuhojmksgemsub2h4uy2gewdnr35kswvljevcba)) + +You may not use the contents of this repository except in compliance +with one of the listed Licenses. For an extended clarification of the +intent behind the choice of Licensing please refer to +https://protocol.ai/blog/announcing-the-permissive-license-stack/ + +Unless required by applicable law or agreed to in writing, software +distributed under the terms listed in this notice is distributed on +an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +either express or implied. See each License for the specific language +governing permissions and limitations under that License. + + +`SPDX-License-Identifier: Apache-2.0 OR MIT` + +Verbatim copies of both licenses are included below: + +
Apache-2.0 Software License + +``` + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS +``` +
+ +
MIT Software License + +``` +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +``` +
diff --git a/ipld/car/v2/bench_test.go b/ipld/car/v2/bench_test.go new file mode 100644 index 0000000000..2f7dcc63d6 --- /dev/null +++ b/ipld/car/v2/bench_test.go @@ -0,0 +1,203 @@ +package car_test + +import ( + "context" + "io" + "math/rand" + "os" + "path/filepath" + "testing" + + "github.com/ipfs/go-cid" + "github.com/ipfs/go-merkledag" + "github.com/ipld/go-car/v2/blockstore" + + carv2 "github.com/ipld/go-car/v2" +) + +// BenchmarkReadBlocks instantiates a BlockReader, and iterates over all blocks. +// It essentially looks at the contents of any CARv1 or CARv2 file. +// Note that this also uses internal carv1.ReadHeader underneath. +func BenchmarkReadBlocks(b *testing.B) { + path := "testdata/sample-wrapped-v2.car" + + info, err := os.Stat(path) + if err != nil { + b.Fatal(err) + } + b.SetBytes(info.Size()) + b.ReportAllocs() + + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + r, err := os.Open("testdata/sample-wrapped-v2.car") + if err != nil { + b.Fatal(err) + } + br, err := carv2.NewBlockReader(r) + if err != nil { + b.Fatal(err) + } + for { + _, err := br.Next() + if err == io.EOF { + break + } + if err != nil { + b.Fatal(err) + } + } + + if err := r.Close(); err != nil { + b.Fatal(err) + } + } + }) +} + +// BenchmarkExtractV1File extracts inner CARv1 payload from a sample CARv2 file using ExtractV1File. +func BenchmarkExtractV1File(b *testing.B) { + path := filepath.Join(b.TempDir(), "bench-large-v2.car") + generateRandomCarV2File(b, path, 10<<20) // 10 MiB + defer os.Remove(path) + + info, err := os.Stat(path) + if err != nil { + b.Fatal(err) + } + b.SetBytes(info.Size()) + b.ReportAllocs() + b.ResetTimer() + + b.RunParallel(func(pb *testing.PB) { + dstPath := filepath.Join(b.TempDir(), "destination.car") + for pb.Next() { + err = carv2.ExtractV1File(path, dstPath) + if err != nil { + b.Fatal(err) + } + _ = os.Remove(dstPath) + } + }) +} + +// BenchmarkExtractV1UsingReader extracts inner CARv1 payload from a sample CARv2 file using Reader +// API. This benchmark is implemented to be used as a comparison in conjunction with +// BenchmarkExtractV1File. +func BenchmarkExtractV1UsingReader(b *testing.B) { + path := filepath.Join(b.TempDir(), "bench-large-v2.car") + generateRandomCarV2File(b, path, 10<<20) // 10 MiB + defer os.Remove(path) + + info, err := os.Stat(path) + if err != nil { + b.Fatal(err) + } + b.SetBytes(info.Size()) + b.ReportAllocs() + b.ResetTimer() + + b.RunParallel(func(pb *testing.PB) { + dstPath := filepath.Join(b.TempDir(), "destination.car") + for pb.Next() { + dst, err := os.Create(dstPath) + if err != nil { + b.Fatal(err) + } + reader, err := carv2.OpenReader(path) + if err != nil { + b.Fatal(err) + } + dr, err := reader.DataReader() + if err != nil { + b.Fatal(err) + } + _, err = io.Copy(dst, dr) + if err != nil { + b.Fatal(err) + } + if err := dst.Close(); err != nil { + b.Fatal(err) + } + } + }) +} + +// BenchmarkReader_InspectWithBlockValidation benchmarks Reader.Inspect with block hash validation +// for a randomly generated CARv2 file of size 10 MiB. 
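+// Validation re-hashes every block and checks the digest against its CID, so this variant
+// is expected to be slower than BenchmarkReader_InspectWithoutBlockValidation below.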
+func BenchmarkReader_InspectWithBlockValidation(b *testing.B) {
+	path := filepath.Join(b.TempDir(), "bench-large-v2.car")
+	generateRandomCarV2File(b, path, 10<<20) // 10 MiB
+	defer os.Remove(path)
+
+	info, err := os.Stat(path)
+	if err != nil {
+		b.Fatal(err)
+	}
+	b.SetBytes(info.Size())
+	b.ReportAllocs()
+	b.ResetTimer()
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			benchmarkInspect(b, path, true)
+		}
+	})
+}
+
+// BenchmarkReader_InspectWithoutBlockValidation benchmarks Reader.Inspect without block hash
+// validation for a randomly generated CARv2 file of size 10 MiB.
+func BenchmarkReader_InspectWithoutBlockValidation(b *testing.B) {
+	path := filepath.Join(b.TempDir(), "bench-large-v2.car")
+	generateRandomCarV2File(b, path, 10<<20) // 10 MiB
+	defer os.Remove(path)
+
+	info, err := os.Stat(path)
+	if err != nil {
+		b.Fatal(err)
+	}
+	b.SetBytes(info.Size())
+	b.ReportAllocs()
+	b.ResetTimer()
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			benchmarkInspect(b, path, false)
+		}
+	})
+}
+
+func benchmarkInspect(b *testing.B, path string, validateBlockHash bool) {
+	reader, err := carv2.OpenReader(path)
+	if err != nil {
+		b.Fatal(err)
+	}
+	if _, err := reader.Inspect(validateBlockHash); err != nil {
+		b.Fatal(err)
+	}
+}
+
+func generateRandomCarV2File(b *testing.B, path string, minTotalBlockSize int) {
+	// Use fixed RNG for determinism across benchmarks.
+	rng := rand.New(rand.NewSource(1413))
+	bs, err := blockstore.OpenReadWrite(path, []cid.Cid{})
+	if err != nil {
+		b.Fatal(err)
+	}
+	// Check the error before deferring Finalize; otherwise a failed open would
+	// leave bs nil and the deferred call would panic.
+	defer func() {
+		if err := bs.Finalize(); err != nil {
+			b.Fatal(err)
+		}
+	}()
+	buf := make([]byte, 32<<10) // 32 KiB
+	var totalBlockSize int
+	for totalBlockSize < minTotalBlockSize {
+		size, err := rng.Read(buf)
+		if err != nil {
+			b.Fatal(err)
+		}
+
+		blk := merkledag.NewRawNode(buf)
+		if err := bs.Put(context.TODO(), blk); err != nil {
+			b.Fatal(err)
+		}
+		totalBlockSize += size
+	}
+}
diff --git a/ipld/car/v2/block_reader.go b/ipld/car/v2/block_reader.go
new file mode 100644
index 0000000000..fff477a6f1
--- /dev/null
+++ b/ipld/car/v2/block_reader.go
@@ -0,0 +1,221 @@
+package car
+
+import (
+	"fmt"
+	"io"
+
+	"github.com/ipfs/go-cid"
+	blocks "github.com/ipfs/go-libipfs/blocks"
+	"github.com/ipld/go-car/v2/internal/carv1"
+	"github.com/ipld/go-car/v2/internal/carv1/util"
+	internalio "github.com/ipld/go-car/v2/internal/io"
+	"github.com/multiformats/go-varint"
+)
+
+// BlockReader facilitates iteration over CAR blocks for both CARv1 and CARv2.
+// See NewBlockReader.
+type BlockReader struct {
+	// The detected version of the CAR payload.
+	Version uint64
+	// The roots of the CAR payload. May be empty.
+	Roots []cid.Cid
+
+	// Used internally only, by BlockReader.Next during iteration over blocks.
+	r          io.Reader
+	offset     uint64
+	readerSize int64
+	opts       Options
+}
+
+// NewBlockReader instantiates a new BlockReader facilitating iteration over blocks in a CARv1 or
+// CARv2 payload. Upon instantiation, the version is automatically detected and exposed via
+// BlockReader.Version. The root CIDs of the CAR payload are exposed via BlockReader.Roots.
+//
+// See BlockReader.Next
+func NewBlockReader(r io.Reader, opts ...Option) (*BlockReader, error) {
+	options := ApplyOptions(opts...)
+
+	// Read the CARv1 header or CARv2 pragma.
+	// Both are valid CARv1 headers, and are therefore read as such.
+	pragmaOrV1Header, err := carv1.ReadHeader(r, options.MaxAllowedHeaderSize)
+	if err != nil {
+		return nil, err
+	}
+
+	// Populate the block reader version and options.
+	br := &BlockReader{
+		Version: pragmaOrV1Header.Version,
+		opts:    options,
+	}
+
+	// Expect either version 1 or 2.
+	switch br.Version {
+	case 1:
+		// If version is 1, r represents a CARv1.
+		// Simply populate br.Roots and br.r without modifying r.
+		br.Roots = pragmaOrV1Header.Roots
+		br.r = r
+		br.readerSize = -1
+		br.offset, _ = carv1.HeaderSize(pragmaOrV1Header)
+	case 2:
+		// If the version is 2:
+		// 1. Read the CARv2-specific header to locate the inner CARv1 data payload offset and size.
+		// 2. Skip to the beginning of the inner CARv1 data payload.
+		// 3. Re-initialize r as a LimitReader, limited to the size of the inner CARv1 payload.
+		// 4. Read the header of the inner CARv1 data payload via r to populate br.Roots.
+
+		// Read the CARv2-specific header.
+		v2h := Header{}
+		if _, err := v2h.ReadFrom(r); err != nil {
+			return nil, err
+		}
+
+		// Skip to the beginning of the inner CARv1 data payload.
+		// Note, at this point the pragma and CARv1 header have been read.
+		// An io.ReadSeeker is opportunistically constructed from the given io.Reader r.
+		// The constructor does not take an initial offset, so we use Seek with io.SeekCurrent to
+		// fast-forward to the beginning of the data payload by subtracting the pragma and header
+		// size from dataOffset.
+		rs := internalio.ToByteReadSeeker(r)
+		if _, err := rs.Seek(int64(v2h.DataOffset)-PragmaSize-HeaderSize, io.SeekCurrent); err != nil {
+			return nil, err
+		}
+		br.offset = uint64(v2h.DataOffset)
+		br.readerSize = int64(v2h.DataOffset + v2h.DataSize)
+
+		// Set br.r to a LimitReader reading from r, limited to dataSize.
+		br.r = io.LimitReader(r, int64(v2h.DataSize))
+
+		// Populate br.Roots by reading the inner CARv1 data payload header.
+		header, err := carv1.ReadHeader(br.r, options.MaxAllowedHeaderSize)
+		if err != nil {
+			return nil, err
+		}
+		// Assert that the data payload header version is exactly 1, i.e. the header represents a CARv1.
+		if header.Version != 1 {
+			return nil, fmt.Errorf("invalid data payload header version; expected 1, got %v", header.Version)
+		}
+		br.Roots = header.Roots
+	default:
+		// Otherwise, error out with an invalid version, since only versions 1 and 2 are expected.
+		return nil, fmt.Errorf("invalid car version: %d", br.Version)
+	}
+	return br, nil
+}
+
+// Next iterates over blocks in the underlying CAR payload, with an io.EOF error indicating that
+// the end has been reached. Note, this function is forward-only; once the end has been reached it
+// will always return io.EOF.
+//
+// When the payload represents a CARv1, BlockReader.Next simply iterates over blocks until it
+// reaches the end of the underlying io.Reader stream.
+//
+// For a CARv2 payload, the underlying io.Reader is read only up to the end of the last block.
+// Note, in a case where the ZeroLengthSectionAsEOF Option is enabled, io.EOF is returned
+// immediately upon encountering a zero-length section, without reading any further bytes from the
+// underlying io.Reader.
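+//
+// A minimal iteration sketch (f here is assumed to be any io.Reader positioned at the
+// start of a CARv1 or CARv2 payload, such as an opened CAR file):
+//
+//	br, err := NewBlockReader(f)
+//	if err != nil {
+//		// handle error
+//	}
+//	for {
+//		blk, err := br.Next()
+//		if err == io.EOF {
+//			break
+//		}
+//		if err != nil {
+//			// handle error
+//		}
+//		_ = blk // use the block
+//	}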
+func (br *BlockReader) Next() (blocks.Block, error) {
+	c, data, err := util.ReadNode(br.r, br.opts.ZeroLengthSectionAsEOF, br.opts.MaxAllowedSectionSize)
+	if err != nil {
+		return nil, err
+	}
+
+	if !br.opts.TrustedCAR {
+		hashed, err := c.Prefix().Sum(data)
+		if err != nil {
+			return nil, err
+		}
+
+		if !hashed.Equals(c) {
+			return nil, fmt.Errorf("mismatch in content integrity, expected: %s, got: %s", c, hashed)
+		}
+	}
+
+	ss := uint64(c.ByteLen()) + uint64(len(data))
+	br.offset += uint64(varint.UvarintSize(ss)) + ss
+	return blocks.NewBlockWithCid(data, c)
+}
+
+type BlockMetadata struct {
+	cid.Cid
+	Offset uint64
+	Size   uint64
+}
+
+// SkipNext jumps over the next block, returning metadata about what it is (the CID, offset, and size).
+// Like Next, it will return io.EOF once it has reached the end.
+//
+// If the underlying reader used by the BlockReader is actually a ReadSeeker, this method will attempt to
+// seek over the underlying data rather than reading it into memory.
+func (br *BlockReader) SkipNext() (*BlockMetadata, error) {
+	sctSize, err := util.LdReadSize(br.r, br.opts.ZeroLengthSectionAsEOF, br.opts.MaxAllowedSectionSize)
+	if err != nil {
+		return nil, err
+	}
+
+	if sctSize == 0 {
+		_, _, err := cid.CidFromBytes([]byte{})
+		return nil, err
+	}
+
+	cidSize, c, err := cid.CidFromReader(io.LimitReader(br.r, int64(sctSize)))
+	if err != nil {
+		return nil, err
+	}
+
+	blkSize := sctSize - uint64(cidSize)
+	if brs, ok := br.r.(io.ReadSeeker); ok {
+		// carv1 and we don't know the size, so work it out and cache it
+		if br.readerSize == -1 {
+			cur, err := brs.Seek(0, io.SeekCurrent)
+			if err != nil {
+				return nil, err
+			}
+			end, err := brs.Seek(0, io.SeekEnd)
+			if err != nil {
+				return nil, err
+			}
+			br.readerSize = end
+			if _, err = brs.Seek(cur, io.SeekStart); err != nil {
+				return nil, err
+			}
+		}
+		// Seek over the block payload instead of reading it into memory.
+		finalOffset, err := brs.Seek(int64(blkSize), io.SeekCurrent)
+		if err != nil {
+			return nil, err
+		}
+		if finalOffset != int64(br.offset)+int64(sctSize)+int64(varint.UvarintSize(sctSize)) {
+			return nil, fmt.Errorf("unexpected length")
+		}
+		if finalOffset > br.readerSize {
+			return nil, io.ErrUnexpectedEOF
+		}
+		br.offset = uint64(finalOffset)
+		return &BlockMetadata{
+			c,
+			uint64(finalOffset) - sctSize - uint64(varint.UvarintSize(sctSize)),
+			blkSize,
+		}, nil
+	}
+
+	// Not a seeker: consume the block payload by discarding it.
+ readCnt, err := io.CopyN(io.Discard, br.r, int64(blkSize)) + if err != nil { + if err == io.EOF { + return nil, io.ErrUnexpectedEOF + } + return nil, err + } + if readCnt != int64(blkSize) { + return nil, fmt.Errorf("unexpected length") + } + origOffset := br.offset + br.offset += uint64(varint.UvarintSize(sctSize)) + sctSize + + return &BlockMetadata{ + c, + origOffset, + blkSize, + }, nil +} diff --git a/ipld/car/v2/block_reader_test.go b/ipld/car/v2/block_reader_test.go new file mode 100644 index 0000000000..ca6c2404d6 --- /dev/null +++ b/ipld/car/v2/block_reader_test.go @@ -0,0 +1,237 @@ +package car_test + +import ( + "bytes" + "encoding/hex" + "fmt" + "io" + "os" + "testing" + + "github.com/ipfs/go-cid" + carv2 "github.com/ipld/go-car/v2" + "github.com/ipld/go-car/v2/internal/carv1" + mh "github.com/multiformats/go-multihash" + "github.com/multiformats/go-varint" + "github.com/stretchr/testify/require" +) + +func TestBlockReaderFailsOnUnknownVersion(t *testing.T) { + r := requireReaderFromPath(t, "testdata/sample-rootless-v42.car") + _, err := carv2.NewBlockReader(r) + require.EqualError(t, err, "invalid car version: 42") +} + +func TestBlockReaderFailsOnCorruptPragma(t *testing.T) { + r := requireReaderFromPath(t, "testdata/sample-corrupt-pragma.car") + _, err := carv2.NewBlockReader(r) + require.EqualError(t, err, "unexpected EOF") +} + +func TestBlockReader_WithCarV1Consistency(t *testing.T) { + tests := []struct { + name string + path string + zerLenAsEOF bool + wantVersion uint64 + }{ + { + name: "CarV1WithoutZeroLengthSection", + path: "testdata/sample-v1.car", + wantVersion: 1, + }, + { + name: "CarV1WithZeroLenSection", + path: "testdata/sample-v1-with-zero-len-section.car", + zerLenAsEOF: true, + wantVersion: 1, + }, + { + name: "AnotherCarV1WithZeroLenSection", + path: "testdata/sample-v1-with-zero-len-section2.car", + zerLenAsEOF: true, + wantVersion: 1, + }, + { + name: "CarV1WithZeroLenSectionWithoutOption", + path: "testdata/sample-v1-with-zero-len-section.car", + wantVersion: 1, + }, + { + name: "AnotherCarV1WithZeroLenSectionWithoutOption", + path: "testdata/sample-v1-with-zero-len-section2.car", + wantVersion: 1, + }, + { + name: "CorruptCarV1", + path: "testdata/sample-v1-tailing-corrupt-section.car", + wantVersion: 1, + }, + { + name: "CarV2WrappingV1", + path: "testdata/sample-wrapped-v2.car", + wantVersion: 2, + }, + { + name: "CarV2ProducedByBlockstore", + path: "testdata/sample-rw-bs-v2.car", + wantVersion: 2, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + r := requireReaderFromPath(t, tt.path) + subject, err := carv2.NewBlockReader(r, carv2.ZeroLengthSectionAsEOF(tt.zerLenAsEOF)) + require.NoError(t, err) + + require.Equal(t, tt.wantVersion, subject.Version) + + var wantReader *carv1.CarReader + switch tt.wantVersion { + case 1: + wantReader = requireNewCarV1ReaderFromV1File(t, tt.path, tt.zerLenAsEOF) + case 2: + wantReader = requireNewCarV1ReaderFromV2File(t, tt.path, tt.zerLenAsEOF) + default: + require.Failf(t, "invalid test-case", "unknown wantVersion %v", tt.wantVersion) + } + require.Equal(t, wantReader.Header.Roots, subject.Roots) + + for { + gotBlock, gotErr := subject.Next() + wantBlock, wantErr := wantReader.Next() + require.Equal(t, wantBlock, gotBlock) + require.Equal(t, wantErr, gotErr) + if gotErr == io.EOF { + break + } + } + }) + t.Run(tt.name+"-skipping-reads", func(t *testing.T) { + r := requireReaderFromPath(t, tt.path) + subject, err := carv2.NewBlockReader(r, carv2.ZeroLengthSectionAsEOF(tt.zerLenAsEOF)) + 
+			require.NoError(t, err)
+
+			require.Equal(t, tt.wantVersion, subject.Version)
+
+			var wantReader *carv1.CarReader
+			switch tt.wantVersion {
+			case 1:
+				wantReader = requireNewCarV1ReaderFromV1File(t, tt.path, tt.zerLenAsEOF)
+			case 2:
+				wantReader = requireNewCarV1ReaderFromV2File(t, tt.path, tt.zerLenAsEOF)
+			default:
+				require.Failf(t, "invalid test-case", "unknown wantVersion %v", tt.wantVersion)
+			}
+			require.Equal(t, wantReader.Header.Roots, subject.Roots)
+
+			for {
+				gotBlock, gotErr := subject.SkipNext()
+				wantBlock, wantErr := wantReader.Next()
+				if wantErr != nil && gotErr == nil {
+					fmt.Printf("want was %+v\n", wantReader)
+					fmt.Printf("want was err, got was %+v / %d\n", gotBlock, gotBlock.Size)
+				}
+				require.Equal(t, wantErr, gotErr)
+				if gotErr == io.EOF {
+					break
+				}
+				if gotErr == nil {
+					require.Equal(t, wantBlock.Cid(), gotBlock.Cid)
+					require.Equal(t, uint64(len(wantBlock.RawData())), gotBlock.Size)
+				}
+			}
+		})
+	}
+}
+
+func TestMaxSectionLength(t *testing.T) {
+	// headerHex is the zero-roots CARv1 header
+	const headerHex = "11a265726f6f7473806776657273696f6e01"
+	headerBytes, _ := hex.DecodeString(headerHex)
+	// 8 MiB block of zeros
+	block := make([]byte, 8<<20)
+	// CID for that block
+	pfx := cid.NewPrefixV1(cid.Raw, mh.SHA2_256)
+	cid, err := pfx.Sum(block)
+	require.NoError(t, err)
+
+	// construct CAR
+	var buf bytes.Buffer
+	buf.Write(headerBytes)
+	buf.Write(varint.ToUvarint(uint64(len(cid.Bytes()) + len(block))))
+	buf.Write(cid.Bytes())
+	buf.Write(block)
+
+	// try to read it
+	car, err := carv2.NewBlockReader(bytes.NewReader(buf.Bytes()))
+	require.NoError(t, err)
+	// error should occur on first section read
+	_, err = car.Next()
+	require.EqualError(t, err, "invalid section data, length of read beyond allowable maximum")
+
+	// successful read by expanding the max section size
+	car, err = carv2.NewBlockReader(bytes.NewReader(buf.Bytes()), carv2.MaxAllowedSectionSize((8<<20)+40))
+	require.NoError(t, err)
+	// can now read block and get our 8 MiB zeroed byte array
+	readBlock, err := car.Next()
+	require.NoError(t, err)
+	require.True(t, bytes.Equal(block, readBlock.RawData()))
+}
+
+func TestTrustedCAR(t *testing.T) {
+	// headerHex is the zero-roots CARv1 header
+	const headerHex = "11a265726f6f7473806776657273696f6e01"
+	headerBytes, _ := hex.DecodeString(headerHex)
+	// block of zeros
+	block := make([]byte, 5)
+	// CID for that block
+	pfx := cid.NewPrefixV1(cid.Raw, mh.SHA2_256)
+	cid, err := pfx.Sum(block)
+	require.NoError(t, err)
+
+	// Modify the block so it won't match the CID anymore
+	block[2] = 0xFF
+	// construct CAR
+	var buf bytes.Buffer
+	buf.Write(headerBytes)
+	buf.Write(varint.ToUvarint(uint64(len(cid.Bytes()) + len(block))))
+	buf.Write(cid.Bytes())
+	buf.Write(block)
+
+	// try to read it as trusted
+	car, err := carv2.NewBlockReader(bytes.NewReader(buf.Bytes()), carv2.WithTrustedCAR(true))
+	require.NoError(t, err)
+	_, err = car.Next()
+	require.NoError(t, err)
+
+	// Try to read it as untrusted - should fail
+	car, err = carv2.NewBlockReader(bytes.NewReader(buf.Bytes()), carv2.WithTrustedCAR(false))
+	require.NoError(t, err)
+	// error should occur on first section read
+	_, err = car.Next()
+	require.EqualError(t, err, "mismatch in content integrity, expected: bafkreieikviivlpbn3cxhuq6njef37ikoysaqxa2cs26zxleqxpay2bzuq, got: bafkreidgklrppelx4fxcsna7cxvo3g7ayedfojkqeuus6kz6e4hy7gukmy")
+}
+
+func TestMaxHeaderLength(t *testing.T) {
+	// headerHex is a 5-root CARv1 header
+	const headerHex =
"de01a265726f6f747385d82a58250001711220785197229dc8bb1152945da58e2348f7e279eeded06cc2ca736d0e879858b501d82a58250001711220785197229dc8bb1152945da58e2348f7e279eeded06cc2ca736d0e879858b501d82a58250001711220785197229dc8bb1152945da58e2348f7e279eeded06cc2ca736d0e879858b501d82a58250001711220785197229dc8bb1152945da58e2348f7e279eeded06cc2ca736d0e879858b501d82a58250001711220785197229dc8bb1152945da58e2348f7e279eeded06cc2ca736d0e879858b5016776657273696f6e01" + headerBytes, _ := hex.DecodeString(headerHex) + c, _ := cid.Decode("bafyreidykglsfhoixmivffc5uwhcgshx4j465xwqntbmu43nb2dzqwfvae") + + // successful read + car, err := carv2.NewBlockReader(bytes.NewReader(headerBytes)) + require.NoError(t, err) + require.ElementsMatch(t, []cid.Cid{c, c, c, c, c}, car.Roots) + + // unsuccessful read, low allowable max header length (length - 3 because there are 2 bytes in the length varint prefix) + _, err = carv2.NewBlockReader(bytes.NewReader(headerBytes), carv2.MaxAllowedHeaderSize(uint64(len(headerBytes)-3))) + require.EqualError(t, err, "invalid header data, length of read beyond allowable maximum") +} + +func requireReaderFromPath(t *testing.T, path string) io.Reader { + f, err := os.Open(path) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, f.Close()) }) + return f +} diff --git a/ipld/car/v2/blockstore/bench_test.go b/ipld/car/v2/blockstore/bench_test.go new file mode 100644 index 0000000000..57544b967b --- /dev/null +++ b/ipld/car/v2/blockstore/bench_test.go @@ -0,0 +1,80 @@ +package blockstore_test + +import ( + "context" + "io" + mathrand "math/rand" + "os" + "testing" + + "github.com/ipfs/go-cid" + carv2 "github.com/ipld/go-car/v2" + "github.com/ipld/go-car/v2/blockstore" +) + +// BenchmarkOpenReadOnlyV1 opens a read-only blockstore, +// and retrieves all blocks in a shuffled order. +// Note that this benchmark includes generating an index, +// since the input file is a CARv1. +func BenchmarkOpenReadOnlyV1(b *testing.B) { + path := "../testdata/sample-v1.car" + f, err := os.Open("../testdata/sample-v1.car") + if err != nil { + b.Fatal(err) + } + defer func() { + if err := f.Close(); err != nil { + b.Fatal(err) + } + }() + info, err := os.Stat(path) + if err != nil { + b.Fatal(err) + } + b.SetBytes(info.Size()) + b.ReportAllocs() + + var shuffledCIDs []cid.Cid + br, err := carv2.NewBlockReader(f) + if err != nil { + b.Fatal(err) + } + for { + block, err := br.Next() + if err == io.EOF { + break + } + if err != nil { + b.Fatal(err) + } + shuffledCIDs = append(shuffledCIDs, block.Cid()) + } + + // The shuffling needs to be deterministic, + // for the sake of stable benchmark results. + // Any source number works as long as it's fixed. + rnd := mathrand.New(mathrand.NewSource(123456)) + rnd.Shuffle(len(shuffledCIDs), func(i, j int) { + shuffledCIDs[i], shuffledCIDs[j] = shuffledCIDs[j], shuffledCIDs[i] + }) + + b.RunParallel(func(pb *testing.PB) { + for pb.Next() { + bs, err := blockstore.OpenReadOnly(path) + if err != nil { + b.Fatal(err) + } + + for _, c := range shuffledCIDs { + _, err := bs.Get(context.TODO(), c) + if err != nil { + b.Fatal(err) + } + } + + if err := bs.Close(); err != nil { + b.Fatal(err) + } + } + }) +} diff --git a/ipld/car/v2/blockstore/doc.go b/ipld/car/v2/blockstore/doc.go new file mode 100644 index 0000000000..4aa4cfdfc9 --- /dev/null +++ b/ipld/car/v2/blockstore/doc.go @@ -0,0 +1,28 @@ +// Package blockstore implements the IPFS blockstore interface backed by a CAR file. +// This package provides two flavours of blockstore: ReadOnly and ReadWrite. 
+//
+// The ReadOnly blockstore provides read-only random access to a given data payload, either in
+// unindexed CARv1 format or indexed/unindexed CARv2 format:
+// - NewReadOnly can be used to instantiate a new read-only blockstore for a given CARv1
+// or CARv2 data payload with an optional index override.
+// - OpenReadOnly can be used to instantiate a new read-only blockstore for a given CARv1
+// or CARv2 file, with automatic index generation if the index is not present.
+//
+// The ReadWrite blockstore allows writing and reading of the blocks concurrently. The user of this
+// blockstore is responsible for calling ReadWrite.Finalize when finished writing blocks.
+// Upon finalization, the instance can no longer be used for reading or writing blocks and will
+// error if used. To continue reading the blocks, users are encouraged to use a ReadOnly blockstore
+// instantiated from the same file path using OpenReadOnly.
+// A user may resume reading/writing from files produced by an instance of the ReadWrite blockstore.
+// The resumption is attempted automatically if the path passed to OpenReadWrite exists.
+//
+// Note that the blockstore implementations in this package behave similarly to the IPFS IdStore wrapper
+// when given CIDs with multihash.IDENTITY code.
+// More specifically, for CIDs with multihash.IDENTITY code:
+// * blockstore.Has will always return true.
+// * blockstore.Get will always succeed, returning the multihash digest of the given CID.
+// * blockstore.GetSize will always succeed, returning the multihash digest length of the given CID.
+// * blockstore.Put and blockstore.PutMany will always succeed without performing any operation unless car.StoreIdentityCIDs is enabled.
+//
+// See: https://pkg.go.dev/github.com/ipfs/go-ipfs-blockstore#NewIdStore
+package blockstore
diff --git a/ipld/car/v2/blockstore/example_test.go b/ipld/car/v2/blockstore/example_test.go
new file mode 100644
index 0000000000..9c7524a9c3
--- /dev/null
+++ b/ipld/car/v2/blockstore/example_test.go
@@ -0,0 +1,159 @@
+package blockstore_test
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"path/filepath"
+	"time"
+
+	"github.com/ipfs/go-cid"
+	blocks "github.com/ipfs/go-libipfs/blocks"
+	"github.com/ipfs/go-merkledag"
+	carv2 "github.com/ipld/go-car/v2"
+	"github.com/ipld/go-car/v2/blockstore"
+)
+
+const cidPrintCount = 5
+
+// ExampleOpenReadOnly opens a read-only blockstore from a CARv1 file, and prints its root CIDs
+// along with the mapping from CID to raw data size for the first five sections in the CAR file.
+func ExampleOpenReadOnly() {
+	// Open a new ReadOnly blockstore from a CARv1 file.
+	// Note, `OpenReadOnly` accepts both CARv1 and CARv2 formats and transparently generates an
+	// index in the background if necessary.
+	// This instance sets the ZeroLengthSectionAsEOF option to treat zero-sized sections in the file as EOF.
+	robs, err := blockstore.OpenReadOnly("../testdata/sample-v1.car",
+		blockstore.UseWholeCIDs(true),
+		carv2.ZeroLengthSectionAsEOF(true),
+	)
+	if err != nil {
+		panic(err)
+	}
+	defer robs.Close()
+
+	// Print root CIDs.
+	roots, err := robs.Roots()
+	if err != nil {
+		panic(err)
+	}
+	fmt.Printf("Contains %v root CID(s):\n", len(roots))
+	for _, r := range roots {
+		fmt.Printf("\t%v\n", r)
+	}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	// Print the raw data size for the first 5 CIDs in the CAR file.
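+	// AllKeysChan streams CIDs on a background goroutine; cancelling ctx (done
+	// below once five keys have been printed) stops the stream early.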
+	keysChan, err := robs.AllKeysChan(ctx)
+	if err != nil {
+		panic(err)
+	}
+	fmt.Printf("List of first %v CIDs and their raw data size:\n", cidPrintCount)
+	i := 1
+	for k := range keysChan {
+		if i > cidPrintCount {
+			cancel()
+			break
+		}
+		size, err := robs.GetSize(context.TODO(), k)
+		if err != nil {
+			panic(err)
+		}
+		fmt.Printf("\t%v -> %v bytes\n", k, size)
+		i++
+	}
+
+	// Output:
+	// Contains 1 root CID(s):
+	// bafy2bzaced4ueelaegfs5fqu4tzsh6ywbbpfk3cxppupmxfdhbpbhzawfw5oy
+	// List of first 5 CIDs and their raw data size:
+	// bafy2bzaced4ueelaegfs5fqu4tzsh6ywbbpfk3cxppupmxfdhbpbhzawfw5oy -> 821 bytes
+	// bafy2bzaceaycv7jhaegckatnncu5yugzkrnzeqsppzegufr35lroxxnsnpspu -> 1053 bytes
+	// bafy2bzaceb62wdepofqu34afqhbcn4a7jziwblt2ih5hhqqm6zitd3qpzhdp4 -> 1094 bytes
+	// bafy2bzaceb3utcspm5jqcdqpih3ztbaztv7yunzkiyfq7up7xmokpxemwgu5u -> 1051 bytes
+	// bafy2bzacedjwekyjresrwjqj4n2r5bnuuu3klncgjo2r3slsp6wgqb37sz4ck -> 821 bytes
+}
+
+// ExampleOpenReadWrite creates a read-write blockstore, puts blocks into it, finalizes it into a
+// complete CARv2, and then resumes it from the same file to append and read more blocks.
+func ExampleOpenReadWrite() {
+	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+	defer cancel()
+
+	thisBlock := merkledag.NewRawNode([]byte("fish")).Block
+	thatBlock := merkledag.NewRawNode([]byte("lobster")).Block
+	andTheOtherBlock := merkledag.NewRawNode([]byte("barreleye")).Block
+
+	tdir, err := os.MkdirTemp(os.TempDir(), "example-*")
+	if err != nil {
+		panic(err)
+	}
+	dst := filepath.Join(tdir, "sample-rw-bs-v2.car")
+	roots := []cid.Cid{thisBlock.Cid(), thatBlock.Cid(), andTheOtherBlock.Cid()}
+
+	rwbs, err := blockstore.OpenReadWrite(dst, roots, carv2.UseDataPadding(1413), carv2.UseIndexPadding(42))
+	if err != nil {
+		panic(err)
+	}
+
+	// Put all blocks onto the blockstore.
+	blocks := []blocks.Block{thisBlock, thatBlock}
+	if err := rwbs.PutMany(ctx, blocks); err != nil {
+		panic(err)
+	}
+	fmt.Printf("Successfully wrote %v blocks into the blockstore.\n", len(blocks))
+
+	// Any blocks put can be read back using the same blockstore instance.
+	block, err := rwbs.Get(ctx, thatBlock.Cid())
+	if err != nil {
+		panic(err)
+	}
+	fmt.Printf("Read back block just put with raw value of `%v`.\n", string(block.RawData()))
+
+	// Finalize the blockstore to flush out the index and make a complete CARv2.
+	if err := rwbs.Finalize(); err != nil {
+		panic(err)
+	}
+
+	// Resume from the same file to add more blocks.
+	// Note, the UseDataPadding and roots must match the values passed to the blockstore instance
+	// that created the original file. Otherwise, we cannot resume from the same file.
+	resumedRwbos, err := blockstore.OpenReadWrite(dst, roots, carv2.UseDataPadding(1413))
+	if err != nil {
+		panic(err)
+	}
+
+	// Put another block, appending it to the set of blocks that were written previously.
+	if err := resumedRwbos.Put(ctx, andTheOtherBlock); err != nil {
+		panic(err)
+	}
+
+	// Read back the block put before resumption.
+	// Blocks previously put are present.
+	block, err = resumedRwbos.Get(ctx, thatBlock.Cid())
+	if err != nil {
+		panic(err)
+	}
+	fmt.Printf("Resumed blockstore contains blocks put previously with raw value of `%v`.\n", string(block.RawData()))
+
+	// Read back the block put after resumption.
+	// Blocks put after resumption are also present.
+	block, err = resumedRwbos.Get(ctx, andTheOtherBlock.Cid())
+	if err != nil {
+		panic(err)
+	}
+	fmt.Printf("It also contains the block put after resumption with raw value of `%v`.\n", string(block.RawData()))
+
+	// Finalize the blockstore to flush out the index and make a complete CARv2.
+	// Note, Finalize must be called on an open ReadWrite blockstore to flush out a complete CARv2.
+	if err := resumedRwbos.Finalize(); err != nil {
+		panic(err)
+	}
+
+	// Output:
+	// Successfully wrote 2 blocks into the blockstore.
+	// Read back block just put with raw value of `lobster`.
+	// Resumed blockstore contains blocks put previously with raw value of `lobster`.
+	// It also contains the block put after resumption with raw value of `barreleye`.
+}
diff --git a/ipld/car/v2/blockstore/readonly.go b/ipld/car/v2/blockstore/readonly.go
new file mode 100644
index 0000000000..a102b20c4d
--- /dev/null
+++ b/ipld/car/v2/blockstore/readonly.go
@@ -0,0 +1,474 @@
+package blockstore
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"sync"
+
+	"github.com/ipfs/go-cid"
+	blockstore "github.com/ipfs/go-ipfs-blockstore"
+	format "github.com/ipfs/go-ipld-format"
+	blocks "github.com/ipfs/go-libipfs/blocks"
+	carv2 "github.com/ipld/go-car/v2"
+	"github.com/ipld/go-car/v2/index"
+	"github.com/ipld/go-car/v2/internal/carv1"
+	internalio "github.com/ipld/go-car/v2/internal/io"
+	"github.com/ipld/go-car/v2/internal/store"
+	"github.com/multiformats/go-varint"
+	"golang.org/x/exp/mmap"
+)
+
+var _ blockstore.Blockstore = (*ReadOnly)(nil)
+
+var (
+	errZeroLengthSection = fmt.Errorf("zero-length carv2 section not allowed by default; see WithZeroLengthSectionAsEOF option")
+	errReadOnly          = fmt.Errorf("called write method on a read-only carv2 blockstore")
+	errClosed            = fmt.Errorf("cannot use a carv2 blockstore after closing")
+)
+
+// ReadOnly provides a read-only CAR Block Store.
+type ReadOnly struct {
+	// mu allows ReadWrite to be safe for concurrent use.
+	// It's in ReadOnly so that read operations also grab read locks,
+	// given that ReadWrite embeds ReadOnly for methods like Get and Has.
+	//
+	// The main fields guarded by the mutex are the index and the underlying writers.
+	// For simplicity, the entirety of the blockstore methods grab the mutex.
+	mu sync.RWMutex
+
+	// When true, the blockstore has been closed via Close, Discard, or
+	// Finalize, and must not be used. Any further blockstore method calls
+	// will return errClosed to avoid panics or broken behavior.
+	closed bool
+
+	// The backing containing the data payload in CARv1 format.
+	backing io.ReaderAt
+
+	// The CARv1 content index.
+	idx index.Index
+
+	// If we called carv2.NewReaderMmap, remember to close it too.
+	carv2Closer io.Closer
+
+	opts carv2.Options
+}
+
+type contextKey string
+
+const asyncErrHandlerKey contextKey = "asyncErrorHandlerKey"
+
+// UseWholeCIDs is re-exported from carv2 for convenience.
+var UseWholeCIDs = carv2.UseWholeCIDs
+
+// NewReadOnly creates a new ReadOnly blockstore from the backing with an optional index as idx.
+// This function accepts both CARv1 and CARv2 backing.
+// The blockstore is instantiated with the given index if it is not nil.
+//
+// Otherwise:
+// * For a CARv1 backing, an index is generated.
+// * For a CARv2 backing, an index is only generated if Header.HasIndex returns false.
+//
+// There is no need to call ReadOnly.Close on instances returned by this function.
+func NewReadOnly(backing io.ReaderAt, idx index.Index, opts ...carv2.Option) (*ReadOnly, error) {
+	b := &ReadOnly{
+		opts: carv2.ApplyOptions(opts...),
+	}
+
+	version, err := readVersion(backing, opts...)
+	if err != nil {
+		return nil, err
+	}
+	switch version {
+	case 1:
+		if idx == nil {
+			if idx, err = generateIndex(backing, opts...); err != nil {
+				return nil, err
+			}
+		}
+		b.backing = backing
+		b.idx = idx
+		return b, nil
+	case 2:
+		v2r, err := carv2.NewReader(backing, opts...)
+		if err != nil {
+			return nil, err
+		}
+		if idx == nil {
+			if v2r.Header.HasIndex() {
+				ir, err := v2r.IndexReader()
+				if err != nil {
+					return nil, err
+				}
+				idx, err = index.ReadFrom(ir)
+				if err != nil {
+					return nil, err
+				}
+			} else {
+				dr, err := v2r.DataReader()
+				if err != nil {
+					return nil, err
+				}
+				if idx, err = generateIndex(dr, opts...); err != nil {
+					return nil, err
+				}
+			}
+		}
+		b.backing, err = v2r.DataReader()
+		if err != nil {
+			return nil, err
+		}
+		b.idx = idx
+		return b, nil
+	default:
+		return nil, fmt.Errorf("unsupported car version: %v", version)
+	}
+}
+
+func readVersion(at io.ReaderAt, opts ...carv2.Option) (uint64, error) {
+	var rr io.Reader
+	switch r := at.(type) {
+	case io.Reader:
+		rr = r
+	default:
+		var err error
+		rr, err = internalio.NewOffsetReadSeeker(r, 0)
+		if err != nil {
+			return 0, err
+		}
+	}
+	return carv2.ReadVersion(rr, opts...)
+}
+
+func generateIndex(at io.ReaderAt, opts ...carv2.Option) (index.Index, error) {
+	var rs io.ReadSeeker
+	switch r := at.(type) {
+	case io.ReadSeeker:
+		rs = r
+		// The version may have been read from the given io.ReaderAt; therefore move back to the beginning.
+		if _, err := rs.Seek(0, io.SeekStart); err != nil {
+			return nil, err
+		}
+	default:
+		var err error
+		rs, err = internalio.NewOffsetReadSeeker(r, 0)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	// Note, we do not set any write options so that all write options fall back onto defaults.
+	return carv2.GenerateIndex(rs, opts...)
+}
+
+// OpenReadOnly opens a read-only blockstore from a CAR file (either v1 or v2), generating an index if it does not exist.
+// Note, if the index does not exist, the generated index is ephemeral and only stored in memory.
+// See car.GenerateIndex and Index.Attach for persisting an index onto a CAR file.
+func OpenReadOnly(path string, opts ...carv2.Option) (*ReadOnly, error) {
+	f, err := mmap.Open(path)
+	if err != nil {
+		return nil, err
+	}
+
+	robs, err := NewReadOnly(f, nil, opts...)
+	if err != nil {
+		return nil, err
+	}
+	robs.carv2Closer = f
+
+	return robs, nil
+}
+
+// Index gives direct access to the index.
+// You should never add records on your own there.
+func (b *ReadOnly) Index() index.Index {
+	return b.idx
+}
+
+// DeleteBlock is unsupported and always errors.
+func (b *ReadOnly) DeleteBlock(_ context.Context, _ cid.Cid) error {
+	return errReadOnly
+}
+
+// Has indicates if the store contains a block that corresponds to the given key.
+// This function always returns true for any given key with multihash.IDENTITY
+// code unless the StoreIdentityCIDs option is on, in which case it will defer
+// to the index to check for the existence of the block; the index may or may
+// not contain identity CIDs included in this CAR, depending on whether
+// StoreIdentityCIDs was on when the index was created. If the CAR is a CARv1
+// and StoreIdentityCIDs is on, then the index will contain identity CIDs and
+// this will always return true.
+func (b *ReadOnly) Has(ctx context.Context, key cid.Cid) (bool, error) {
+	if !b.opts.StoreIdentityCIDs {
+		// If we don't store identity CIDs then we can return them straight away as if they are here,
+		// otherwise we need to check for their existence.
+		// Note, we do this without locking, since there is no shared information to lock for in order to perform the check.
+		if _, ok, err := store.IsIdentity(key); err != nil {
+			return false, err
+		} else if ok {
+			return true, nil
+		}
+	}
+
+	b.mu.RLock()
+	defer b.mu.RUnlock()
+
+	if b.closed {
+		return false, errClosed
+	}
+
+	_, _, size, err := store.FindCid(
+		b.backing,
+		b.idx,
+		key,
+		b.opts.BlockstoreUseWholeCIDs,
+		b.opts.ZeroLengthSectionAsEOF,
+		b.opts.MaxAllowedSectionSize,
+		false,
+	)
+	if errors.Is(err, index.ErrNotFound) {
+		return false, nil
+	} else if err != nil {
+		return false, err
+	}
+	return size > -1, nil
+}
+
+// Get gets a block corresponding to the given key.
+// This function always returns the block for any given key with
+// multihash.IDENTITY code unless the StoreIdentityCIDs option is on, in which
+// case it will defer to the index to check for the existence of the block; the
+// index may or may not contain identity CIDs included in this CAR, depending on
+// whether StoreIdentityCIDs was on when the index was created. If the CAR is a
+// CARv1 and StoreIdentityCIDs is on, then the index will contain identity CIDs
+// and this will always return the block.
+func (b *ReadOnly) Get(ctx context.Context, key cid.Cid) (blocks.Block, error) {
+	if !b.opts.StoreIdentityCIDs {
+		// If we don't store identity CIDs then we can return them straight away as if they are here,
+		// otherwise we need to check for their existence.
+		// Note, we do this without locking, since there is no shared information to lock for in order to perform the check.
+		if digest, ok, err := store.IsIdentity(key); err != nil {
+			return nil, err
+		} else if ok {
+			return blocks.NewBlockWithCid(digest, key)
+		}
+	}
+
+	b.mu.RLock()
+	defer b.mu.RUnlock()
+
+	if b.closed {
+		return nil, errClosed
+	}
+
+	data, _, _, err := store.FindCid(
+		b.backing,
+		b.idx,
+		key,
+		b.opts.BlockstoreUseWholeCIDs,
+		b.opts.ZeroLengthSectionAsEOF,
+		b.opts.MaxAllowedSectionSize,
+		true,
+	)
+	if errors.Is(err, index.ErrNotFound) {
+		return nil, format.ErrNotFound{Cid: key}
+	} else if err != nil {
+		return nil, err
+	}
+	return blocks.NewBlockWithCid(data, key)
+}
+
+// GetSize gets the size of an item corresponding to the given key.
+func (b *ReadOnly) GetSize(ctx context.Context, key cid.Cid) (int, error) {
+	// Check if the given CID has multihash.IDENTITY code
+	// Note, we do this without locking, since there is no shared information to lock for in order to perform the check.
+	if digest, ok, err := store.IsIdentity(key); err != nil {
+		return 0, err
+	} else if ok {
+		return len(digest), nil
+	}
+
+	b.mu.RLock()
+	defer b.mu.RUnlock()
+
+	if b.closed {
+		return 0, errClosed
+	}
+
+	_, _, size, err := store.FindCid(
+		b.backing,
+		b.idx,
+		key,
+		b.opts.BlockstoreUseWholeCIDs,
+		b.opts.ZeroLengthSectionAsEOF,
+		b.opts.MaxAllowedSectionSize,
+		false,
+	)
+	if errors.Is(err, index.ErrNotFound) {
+		return -1, format.ErrNotFound{Cid: key}
+	} else if err != nil {
+		return -1, err
+	}
+	return size, nil
+}
+
+// Put is not supported and always returns an error.
+func (b *ReadOnly) Put(context.Context, blocks.Block) error {
+	return errReadOnly
+}
+
+// PutMany is not supported and always returns an error.
+func (b *ReadOnly) PutMany(context.Context, []blocks.Block) error {
+	return errReadOnly
+}
+
+// WithAsyncErrorHandler returns a context with async error handling set to the given errHandler.
+// Any errors that occur during asynchronous operations of AllKeysChan will be passed to the given
+// handler.
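+//
+// A minimal usage sketch (robs here stands for any open ReadOnly blockstore):
+//
+//	ctx := WithAsyncErrorHandler(context.Background(), func(err error) {
+//		fmt.Printf("AllKeysChan failed: %v\n", err)
+//	})
+//	ch, err := robs.AllKeysChan(ctx)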
+func WithAsyncErrorHandler(ctx context.Context, errHandler func(error)) context.Context {
+	return context.WithValue(ctx, asyncErrHandlerKey, errHandler)
+}
+
+// AllKeysChan returns the list of keys in the CAR data payload.
+// If the ctx is constructed using WithAsyncErrorHandler, any errors that occur during asynchronous
+// retrieval of CIDs will be passed to the error handler function set in context.
+// Otherwise, errors will terminate the asynchronous operation silently.
+//
+// See WithAsyncErrorHandler.
+func (b *ReadOnly) AllKeysChan(ctx context.Context) (<-chan cid.Cid, error) {
+	// We release the lock when the channel-sending goroutine stops.
+	// Note that we can't use a deferred unlock here,
+	// because if we return a nil error,
+	// we only want to unlock once the async goroutine has stopped.
+	b.mu.RLock()
+
+	if b.closed {
+		b.mu.RUnlock() // don't hold the mutex forever
+		return nil, errClosed
+	}
+
+	// TODO we may use this walk for populating the index, and we need to be able to iterate keys in this way somewhere for index generation. In general though, when it's asked for all keys from a blockstore with an index, we should iterate through the index when possible rather than linear reads through the full car.
+	rdr, err := internalio.NewOffsetReadSeeker(b.backing, 0)
+	if err != nil {
+		b.mu.RUnlock() // don't hold the mutex forever
+		return nil, err
+	}
+	header, err := carv1.ReadHeader(rdr, b.opts.MaxAllowedHeaderSize)
+	if err != nil {
+		b.mu.RUnlock() // don't hold the mutex forever
+		return nil, fmt.Errorf("error reading car header: %w", err)
+	}
+	headerSize, err := carv1.HeaderSize(header)
+	if err != nil {
+		b.mu.RUnlock() // don't hold the mutex forever
+		return nil, err
+	}
+
+	// TODO: document this choice of 5, or use simpler buffering like 0 or 1.
+	ch := make(chan cid.Cid, 5)
+
+	// Seek to the end of the header.
+	if _, err = rdr.Seek(int64(headerSize), io.SeekStart); err != nil {
+		b.mu.RUnlock() // don't hold the mutex forever
+		return nil, err
+	}
+
+	go func() {
+		defer b.mu.RUnlock()
+		defer close(ch)
+
+		for {
+			length, err := varint.ReadUvarint(rdr)
+			if err != nil {
+				if err != io.EOF {
+					maybeReportError(ctx, err)
+				}
+				return
+			}
+
+			// Null padding; by default it's an error.
+			if length == 0 {
+				if b.opts.ZeroLengthSectionAsEOF {
+					break
+				} else {
+					maybeReportError(ctx, errZeroLengthSection)
+					return
+				}
+			}
+
+			thisItemForNxt, err := rdr.Seek(0, io.SeekCurrent)
+			if err != nil {
+				maybeReportError(ctx, err)
+				return
+			}
+			_, c, err := cid.CidFromReader(rdr)
+			if err != nil {
+				maybeReportError(ctx, err)
+				return
+			}
+			if _, err := rdr.Seek(thisItemForNxt+int64(length), io.SeekStart); err != nil {
+				maybeReportError(ctx, err)
+				return
+			}
+
+			// If we're just using multihashes, flatten to the "raw" codec.
+			if !b.opts.BlockstoreUseWholeCIDs {
+				c = cid.NewCidV1(cid.Raw, c.Hash())
+			}
+
+			select {
+			case ch <- c:
+			case <-ctx.Done():
+				maybeReportError(ctx, ctx.Err())
+				return
+			}
+		}
+	}()
+	return ch, nil
+}
+
+// maybeReportError checks if an error handler is present in the context under the key
+// asyncErrHandlerKey, and if present it will pass the error to it.
+func maybeReportError(ctx context.Context, err error) {
+	value := ctx.Value(asyncErrHandlerKey)
+	if eh, _ := value.(func(error)); eh != nil {
+		eh(err)
+	}
+}
+
+// HashOnRead is currently unimplemented; hashing on reads never happens.
+func (b *ReadOnly) HashOnRead(bool) {
+	// TODO: implement before the final release?
+}
+
+// Roots returns the root CIDs of the backing CAR.
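+// Note that the roots are re-read from the CARv1 header on every call rather
+// than cached, so read errors from the backing reader surface here as well.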
+func (b *ReadOnly) Roots() ([]cid.Cid, error) { + ors, err := internalio.NewOffsetReadSeeker(b.backing, 0) + if err != nil { + return nil, err + } + header, err := carv1.ReadHeader(ors, b.opts.MaxAllowedHeaderSize) + if err != nil { + return nil, fmt.Errorf("error reading car header: %w", err) + } + return header.Roots, nil +} + +// Close closes the underlying reader if it was opened by OpenReadOnly. +// After this call, the blockstore can no longer be used. +// +// Note that this call may block if any blockstore operations are currently in +// progress, including an AllKeysChan that hasn't been fully consumed or cancelled. +func (b *ReadOnly) Close() error { + b.mu.Lock() + defer b.mu.Unlock() + + return b.closeWithoutMutex() +} + +func (b *ReadOnly) closeWithoutMutex() error { + b.closed = true + if b.carv2Closer != nil { + return b.carv2Closer.Close() + } + return nil +} diff --git a/ipld/car/v2/blockstore/readonly_test.go b/ipld/car/v2/blockstore/readonly_test.go new file mode 100644 index 0000000000..f2bc7f1051 --- /dev/null +++ b/ipld/car/v2/blockstore/readonly_test.go @@ -0,0 +1,396 @@ +package blockstore + +import ( + "bytes" + "context" + "io" + "os" + "testing" + "time" + + "github.com/ipfs/go-cid" + format "github.com/ipfs/go-ipld-format" + blocks "github.com/ipfs/go-libipfs/blocks" + "github.com/ipfs/go-merkledag" + "github.com/multiformats/go-multicodec" + "github.com/multiformats/go-multihash" + "github.com/stretchr/testify/require" + + carv2 "github.com/ipld/go-car/v2" + "github.com/ipld/go-car/v2/index" + "github.com/ipld/go-car/v2/internal/carv1" + "github.com/ipld/go-car/v2/internal/store" +) + +func TestReadOnlyGetReturnsBlockstoreNotFoundWhenCidDoesNotExist(t *testing.T) { + subject, err := OpenReadOnly("../testdata/sample-v1.car") + require.NoError(t, err) + nonExistingKey := merkledag.NewRawNode([]byte("lobstermuncher")).Block.Cid() + + // Assert blockstore API returns blockstore.ErrNotFound + gotBlock, err := subject.Get(context.TODO(), nonExistingKey) + require.IsType(t, format.ErrNotFound{}, err) + require.Nil(t, gotBlock) +} + +func TestReadOnly(t *testing.T) { + tests := []struct { + name string + v1OrV2path string + opts []carv2.Option + noIdCids bool + }{ + { + "OpenedWithCarV1", + "../testdata/sample-v1.car", + []carv2.Option{UseWholeCIDs(true), carv2.StoreIdentityCIDs(true)}, + // index is made, but identity CIDs are included so they'll be found + false, + }, + { + "OpenedWithCarV1_NoIdentityCID", + "../testdata/sample-v1.car", + []carv2.Option{UseWholeCIDs(true)}, + // index is made, identity CIDs are not included, but we always short-circuit when StoreIdentityCIDs(false) + false, + }, + { + "OpenedWithCarV2", + "../testdata/sample-wrapped-v2.car", + []carv2.Option{UseWholeCIDs(true), carv2.StoreIdentityCIDs(true)}, + // index already exists, but was made without identity CIDs, but opening with StoreIdentityCIDs(true) means we check the index + true, + }, + { + "OpenedWithCarV2_NoIdentityCID", + "../testdata/sample-wrapped-v2.car", + []carv2.Option{UseWholeCIDs(true)}, + // index already exists, it was made without identity CIDs, but we always short-circuit when StoreIdentityCIDs(false) + false, + }, + { + "OpenedWithCarV1ZeroLenSection", + "../testdata/sample-v1-with-zero-len-section.car", + []carv2.Option{UseWholeCIDs(true), carv2.ZeroLengthSectionAsEOF(true)}, + false, + }, + { + "OpenedWithAnotherCarV1ZeroLenSection", + "../testdata/sample-v1-with-zero-len-section2.car", + []carv2.Option{UseWholeCIDs(true), carv2.ZeroLengthSectionAsEOF(true)}, + false, 
+ }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctx := context.TODO() + + subject, err := OpenReadOnly(tt.v1OrV2path, tt.opts...) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, subject.Close()) }) + + f, err := os.Open(tt.v1OrV2path) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, f.Close()) }) + + reader, err := carv2.NewBlockReader(f, tt.opts...) + require.NoError(t, err) + + // Assert roots match v1 payload. + wantRoots := reader.Roots + gotRoots, err := subject.Roots() + require.NoError(t, err) + require.Equal(t, wantRoots, gotRoots) + + var wantCids []cid.Cid + for { + wantBlock, err := reader.Next() + if err == io.EOF { + break + } + require.NoError(t, err) + + key := wantBlock.Cid() + wantCids = append(wantCids, key) + + // Assert blockstore contains key. + has, err := subject.Has(ctx, key) + require.NoError(t, err) + if key.Prefix().MhType == uint64(multicodec.Identity) && tt.noIdCids { + // fixture wasn't made with StoreIdentityCIDs, but we opened it with StoreIdentityCIDs, + // so they aren't there to find + require.False(t, has) + } else { + require.True(t, has) + } + + // Assert size matches block raw data length. + gotSize, err := subject.GetSize(ctx, key) + wantSize := len(wantBlock.RawData()) + require.NoError(t, err) + require.Equal(t, wantSize, gotSize) + + // Assert block itself matches v1 payload block. + if has { + gotBlock, err := subject.Get(ctx, key) + require.NoError(t, err) + require.Equal(t, wantBlock, gotBlock) + } + + // Assert write operations error + require.Error(t, subject.Put(ctx, wantBlock)) + require.Error(t, subject.PutMany(ctx, []blocks.Block{wantBlock})) + require.Error(t, subject.DeleteBlock(ctx, key)) + } + + ctx, cancel := context.WithTimeout(context.Background(), time.Second*2) + defer cancel() + + // Assert all cids in blockstore match v1 payload CIDs. + allKeysChan, err := subject.AllKeysChan(ctx) + require.NoError(t, err) + var gotCids []cid.Cid + for gotKey := range allKeysChan { + gotCids = append(gotCids, gotKey) + } + require.Equal(t, wantCids, gotCids) + }) + } +} + +func TestNewReadOnlyFailsOnUnknownVersion(t *testing.T) { + f, err := os.Open("../testdata/sample-rootless-v42.car") + require.NoError(t, err) + t.Cleanup(func() { f.Close() }) + subject, err := NewReadOnly(f, nil) + require.Errorf(t, err, "unsupported car version: 42") + require.Nil(t, subject) +} + +func TestReadOnlyAllKeysChanErrHandlerCalledOnTimeout(t *testing.T) { + expiredCtx, cancel := context.WithTimeout(context.Background(), -time.Millisecond) + t.Cleanup(cancel) + + subject, err := OpenReadOnly("../testdata/sample-v1.car") + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, subject.Close()) }) + + // Make a channel to be able to select and block on until error handler is called. + errHandlerCalled := make(chan interface{}) + expiredErrHandlingCtx := WithAsyncErrorHandler(expiredCtx, func(err error) { + defer close(errHandlerCalled) + require.EqualError(t, err, "context deadline exceeded") + }) + _, err = subject.AllKeysChan(expiredErrHandlingCtx) + require.NoError(t, err) + + // Assert error handler was called with required condition, waiting at most 3 seconds. 
+ select { + case <-errHandlerCalled: + break + case <-time.After(time.Second * 3): + require.Fail(t, "error handler was not called within expected time window") + } +} + +func TestReadOnlyAllKeysChanErrHandlerNeverCalled(t *testing.T) { + tests := []struct { + name string + path string + errHandler func(err error) + wantCIDs []cid.Cid + }{ + { + "ReadingValidCarV1ReturnsNoErrors", + "../testdata/sample-v1.car", + func(err error) { + require.Fail(t, "unexpected call", "error handler called unexpectedly with err: %v", err) + }, + listCids(t, newV1ReaderFromV1File(t, "../testdata/sample-v1.car", false)), + }, + { + "ReadingValidCarV2ReturnsNoErrors", + "../testdata/sample-wrapped-v2.car", + func(err error) { + require.Fail(t, "unexpected call", "error handler called unexpectedly with err: %v", err) + }, + listCids(t, newV1ReaderFromV2File(t, "../testdata/sample-wrapped-v2.car", false)), + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + subject, err := OpenReadOnly(tt.path, UseWholeCIDs(true)) + require.NoError(t, err) + ctx := WithAsyncErrorHandler(context.Background(), tt.errHandler) + keysChan, err := subject.AllKeysChan(ctx) + require.NoError(t, err) + var gotCids []cid.Cid + for k := range keysChan { + gotCids = append(gotCids, k) + } + require.Equal(t, tt.wantCIDs, gotCids) + }) + } +} + +func listCids(t *testing.T, v1r *carv1.CarReader) (cids []cid.Cid) { + for { + block, err := v1r.Next() + if err == io.EOF { + break + } + require.NoError(t, err) + cids = append(cids, block.Cid()) + } + return +} + +func newV1ReaderFromV1File(t *testing.T, carv1Path string, zeroLenSectionAsEOF bool) *carv1.CarReader { + f, err := os.Open(carv1Path) + require.NoError(t, err) + t.Cleanup(func() { f.Close() }) + v1r, err := newV1Reader(f, zeroLenSectionAsEOF) + require.NoError(t, err) + return v1r +} + +func newV1ReaderFromV2File(t *testing.T, carv2Path string, zeroLenSectionAsEOF bool) *carv1.CarReader { + f, err := os.Open(carv2Path) + require.NoError(t, err) + t.Cleanup(func() { f.Close() }) + v2r, err := carv2.NewReader(f) + require.NoError(t, err) + dr, err := v2r.DataReader() + require.NoError(t, err) + v1r, err := newV1Reader(dr, zeroLenSectionAsEOF) + require.NoError(t, err) + return v1r +} + +func newV1Reader(r io.Reader, zeroLenSectionAsEOF bool) (*carv1.CarReader, error) { + if zeroLenSectionAsEOF { + return carv1.NewCarReaderWithZeroLengthSectionAsEOF(r) + } + return carv1.NewCarReader(r) +} + +func TestReadOnlyErrorAfterClose(t *testing.T) { + bs, err := OpenReadOnly("../testdata/sample-v1.car") + ctx := context.TODO() + require.NoError(t, err) + + roots, err := bs.Roots() + require.NoError(t, err) + _, err = bs.Has(ctx, roots[0]) + require.NoError(t, err) + _, err = bs.Get(ctx, roots[0]) + require.NoError(t, err) + _, err = bs.GetSize(ctx, roots[0]) + require.NoError(t, err) + + ctx, cancel := context.WithCancel(context.Background()) + _, err = bs.AllKeysChan(ctx) + require.NoError(t, err) + cancel() // to stop the AllKeysChan goroutine + + bs.Close() + + _, err = bs.Roots() + require.Error(t, err) + _, err = bs.Has(ctx, roots[0]) + require.Error(t, err) + _, err = bs.Get(ctx, roots[0]) + require.Error(t, err) + _, err = bs.GetSize(ctx, roots[0]) + require.Error(t, err) + _, err = bs.AllKeysChan(ctx) + require.Error(t, err) + + // TODO: test that closing blocks if an AllKeysChan operation is + // in progress. 
+}
+
+func TestNewReadOnly_CarV1WithoutIndexWorksAsExpected(t *testing.T) {
+	carV1Bytes, err := os.ReadFile("../testdata/sample-v1.car")
+	require.NoError(t, err)
+
+	reader := bytes.NewReader(carV1Bytes)
+	v1r, err := carv1.NewCarReader(reader)
+	require.NoError(t, err)
+	require.Equal(t, uint64(1), v1r.Header.Version)
+
+	// Pick the first block in CARv1 as a candidate to check `Get` works.
+	wantBlock, err := v1r.Next()
+	require.NoError(t, err)
+
+	// Seek back to the beginning of the CARv1 payload.
+	_, err = reader.Seek(0, io.SeekStart)
+	require.NoError(t, err)
+
+	subject, err := NewReadOnly(reader, nil, UseWholeCIDs(true))
+	require.NoError(t, err)
+
+	// Require that the block is found via the ReadOnly API and its contents are as expected.
+	gotBlock, err := subject.Get(context.TODO(), wantBlock.Cid())
+	require.NoError(t, err)
+	require.Equal(t, wantBlock, gotBlock)
+}
+
+func TestReadOnlyIndex(t *testing.T) {
+	tests := []struct {
+		name     string
+		path     string
+		wantCIDs []cid.Cid
+	}{
+		{
+			"IndexCarV1",
+			"../testdata/sample-v1.car",
+			listCids(t, newV1ReaderFromV1File(t, "../testdata/sample-v1.car", false)),
+		},
+		{
+			"IndexCarV2",
+			"../testdata/sample-wrapped-v2.car",
+			listCids(t, newV1ReaderFromV2File(t, "../testdata/sample-wrapped-v2.car", false)),
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			subject, err := OpenReadOnly(tt.path, UseWholeCIDs(true))
+			require.NoError(t, err)
+
+			idx := subject.Index()
+
+			for _, c := range tt.wantCIDs {
+				_, isIdentity, err := store.IsIdentity(c)
+				require.NoError(t, err)
+				if isIdentity {
+					// the index doesn't hold identity CIDs
+					continue
+				}
+				_, err = index.GetFirst(idx, c)
+				require.NoError(t, err)
+			}
+
+			if idx, ok := idx.(index.IterableIndex); ok {
+				expected := make([]multihash.Multihash, 0, len(tt.wantCIDs))
+				for _, c := range tt.wantCIDs {
+					_, isIdentity, err := store.IsIdentity(c)
+					require.NoError(t, err)
+					if isIdentity {
+						// the index doesn't hold identity CIDs
+						continue
+					}
+					expected = append(expected, c.Hash())
+				}
+
+				var got []multihash.Multihash
+				err = idx.ForEach(func(m multihash.Multihash, u uint64) error {
+					got = append(got, m)
+					return nil
+				})
+				require.NoError(t, err)
+				require.ElementsMatch(t, expected, got)
+			}
+		})
+	}
+}
diff --git a/ipld/car/v2/blockstore/readwrite.go b/ipld/car/v2/blockstore/readwrite.go
new file mode 100644
index 0000000000..e33f13104d
--- /dev/null
+++ b/ipld/car/v2/blockstore/readwrite.go
@@ -0,0 +1,389 @@
+package blockstore
+
+import (
+	"context"
+	"fmt"
+	"os"
+
+	"github.com/ipfs/go-cid"
+	blockstore "github.com/ipfs/go-ipfs-blockstore"
+	blocks "github.com/ipfs/go-libipfs/blocks"
+
+	carv2 "github.com/ipld/go-car/v2"
+	"github.com/ipld/go-car/v2/index"
+	"github.com/ipld/go-car/v2/internal/carv1"
+	"github.com/ipld/go-car/v2/internal/carv1/util"
+	internalio "github.com/ipld/go-car/v2/internal/io"
+	"github.com/ipld/go-car/v2/internal/store"
+)
+
+var _ blockstore.Blockstore = (*ReadWrite)(nil)
+
+var (
+	errFinalized = fmt.Errorf("cannot write in a carv2 blockstore after finalize")
+)
+
+// ReadWrite implements a blockstore that stores blocks in CARv2 format.
+// Blocks put into the blockstore can be read back once they are successfully written.
+// This implementation is preferable for a write-heavy workload.
+// The blocks are written immediately on Put and PutMany calls, while the index is stored in memory
+// and updated incrementally.
+//
+// The Finalize function must be called once writing blocks is finished.
+// Upon calling Finalize, the header is finalized and the index is written out.
+// Once finalized, all read and write calls to this blockstore will result in errors.
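+//
+// A typical write path, as a sketch (out.car, roots, ctx and blk are illustrative):
+//
+//	bs, _ := OpenReadWrite("out.car", roots)
+//	_ = bs.Put(ctx, blk)
+//	_ = bs.Finalize()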
+type ReadWrite struct {
+	ronly ReadOnly
+
+	f          *os.File
+	dataWriter *internalio.OffsetWriteSeeker
+	idx        *store.InsertionIndex
+	header     carv2.Header
+
+	finalized bool // also protected by ronly.mu
+
+	opts carv2.Options
+}
+
+var WriteAsCarV1 = carv2.WriteAsCarV1
+var AllowDuplicatePuts = carv2.AllowDuplicatePuts
+
+// OpenReadWrite creates a new ReadWrite at the given path with a provided set of root CIDs and options.
+//
+// ReadWrite.Finalize must be called once putting and reading blocks are no longer needed.
+// Upon calling ReadWrite.Finalize the CARv2 header and index are written out onto the file and the
+// backing file is closed. Once finalized, all read and write calls to this blockstore will result
+// in errors. Note, ReadWrite.Finalize must be called on an open instance regardless of whether any
+// blocks were put or not.
+//
+// If a file at the given path does not exist, the instantiation will write car.Pragma and data payload
+// header (i.e. the inner CARv1 header) onto the file before returning.
+//
+// When the given path already exists, the blockstore will attempt to resume from it.
+// On resumption the existing data sections in the file are re-indexed, allowing the caller to continue
+// putting any remaining blocks without having to re-ingest blocks for which a previous ReadWrite.Put
+// returned successfully.
+//
+// Resumption only works on files that were created by a previous instance of a ReadWrite
+// blockstore. This means a file created as a result of a successful call to OpenReadWrite can be
+// resumed from as long as write operations such as ReadWrite.Put, ReadWrite.PutMany returned
+// successfully. On resumption the roots argument and WithDataPadding option must match the
+// previous instantiation of the ReadWrite blockstore that created the file. More explicitly, the file
+// being resumed from must:
+//  1. start with a complete CARv2 car.Pragma.
+//  2. contain a complete CARv1 data header with root CIDs matching the CIDs passed to the
+//     constructor, starting at offset optionally padded by WithDataPadding, followed by zero or
+//     more complete data sections. If any corrupt data sections are present the resumption will fail.
+//     Note, if set previously, the blockstore must use the same WithDataPadding option as before,
+//     since this option is used to locate the CARv1 data payload.
+//
+// Note, resumption should be used with WithCidDeduplication, so that blocks that are successfully
+// written into the file are not re-written, unless the user explicitly wants duplicate blocks.
+//
+// Resuming from finalized files is allowed. However, resumption will regenerate the index
+// regardless by scanning every existing block in the file.
+func OpenReadWrite(path string, roots []cid.Cid, opts ...carv2.Option) (*ReadWrite, error) {
+	f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE, 0o666) // TODO: Should the user be able to configure FileMode permissions?
+	if err != nil {
+		return nil, fmt.Errorf("could not open read/write file: %w", err)
+	}
+	// If construction of the blockstore fails, make sure to close off the open file.
+	defer func() {
+		if err != nil {
+			f.Close()
+		}
+	}()
+	rwbs, err := OpenReadWriteFile(f, roots, opts...)
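+	// (OpenReadWriteFile does the actual construction; this wrapper only adds
+	// file lifecycle management: the deferred Close above on failure, and the
+	// carv2Closer assignment below so that finalizing closes the file.)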
+	if err != nil {
+		return nil, err
+	}
+	// close the file when finalizing
+	rwbs.ronly.carv2Closer = rwbs.f
+	return rwbs, nil
+}
+
+// OpenReadWriteFile is similar to OpenReadWrite, but lets you control the file lifecycle.
+// You are responsible for closing the given file.
+func OpenReadWriteFile(f *os.File, roots []cid.Cid, opts ...carv2.Option) (*ReadWrite, error) {
+	stat, err := f.Stat()
+	if err != nil {
+		// Note, we should not get an os.ErrNotExist here because the flags used to open the file include os.O_CREATE
+		return nil, err
+	}
+	// Try to resume by default if the file size is non-zero.
+	resume := stat.Size() != 0
+
+	// Instantiate the blockstore.
+	// Set the header field before applying options, since padding options may modify the header.
+	rwbs := &ReadWrite{
+		f:         f,
+		idx:       store.NewInsertionIndex(),
+		header:    carv2.NewHeader(0),
+		opts:      carv2.ApplyOptions(opts...),
+		finalized: false,
+	}
+	rwbs.ronly.opts = rwbs.opts
+
+	if p := rwbs.opts.DataPadding; p > 0 {
+		rwbs.header = rwbs.header.WithDataPadding(p)
+	}
+	if p := rwbs.opts.IndexPadding; p > 0 {
+		rwbs.header = rwbs.header.WithIndexPadding(p)
+	}
+
+	offset := int64(rwbs.header.DataOffset)
+	if rwbs.opts.WriteAsCarV1 {
+		offset = 0
+	}
+	rwbs.dataWriter = internalio.NewOffsetWriter(rwbs.f, offset)
+	var v1r internalio.ReadSeekerAt
+	v1r, err = internalio.NewOffsetReadSeeker(rwbs.f, offset)
+	if err != nil {
+		return nil, err
+	}
+	rwbs.ronly.backing = v1r
+	rwbs.ronly.idx = rwbs.idx
+
+	if resume {
+		if err = store.ResumableVersion(f, rwbs.opts.WriteAsCarV1); err != nil {
+			return nil, err
+		}
+		if err = store.Resume(
+			f,
+			rwbs.ronly.backing,
+			rwbs.dataWriter,
+			rwbs.idx,
+			roots,
+			rwbs.header.DataOffset,
+			rwbs.opts.WriteAsCarV1,
+			rwbs.opts.MaxAllowedHeaderSize,
+			rwbs.opts.ZeroLengthSectionAsEOF,
+		); err != nil {
+			return nil, err
+		}
+	} else {
+		if err = rwbs.initWithRoots(!rwbs.opts.WriteAsCarV1, roots); err != nil {
+			return nil, err
+		}
+	}
+
+	return rwbs, nil
+}
+
+func (b *ReadWrite) initWithRoots(v2 bool, roots []cid.Cid) error {
+	if v2 {
+		if _, err := b.f.WriteAt(carv2.Pragma, 0); err != nil {
+			return err
+		}
+	}
+	return carv1.WriteHeader(&carv1.CarHeader{Roots: roots, Version: 1}, b.dataWriter)
+}
+
+// Index gives direct access to the index.
+// You should never add records on your own there.
+func (b *ReadWrite) Index() index.Index {
+	return b.idx
+}
+
+// Put puts a given block to the underlying datastore.
+func (b *ReadWrite) Put(ctx context.Context, blk blocks.Block) error {
+	// PutMany already checks b.ronly.closed.
+	return b.PutMany(ctx, []blocks.Block{blk})
+}
+
+// PutMany puts a slice of blocks at the same time using batching
+// capabilities of the underlying datastore whenever possible.
+func (b *ReadWrite) PutMany(ctx context.Context, blks []blocks.Block) error {
+	b.ronly.mu.Lock()
+	defer b.ronly.mu.Unlock()
+
+	if b.ronly.closed {
+		return errClosed
+	}
+	if b.finalized {
+		return errFinalized
+	}
+
+	for _, bl := range blks {
+		c := bl.Cid()
+
+		if should, err := store.ShouldPut(
+			b.idx,
+			c,
+			b.opts.MaxIndexCidSize,
+			b.opts.StoreIdentityCIDs,
+			b.opts.BlockstoreAllowDuplicatePuts,
+			b.opts.BlockstoreUseWholeCIDs,
+		); err != nil {
+			return err
+		} else if !should {
+			continue
+		}
+
+		n := uint64(b.dataWriter.Position())
+		if err := util.LdWrite(b.dataWriter, c.Bytes(), bl.RawData()); err != nil {
+			return err
+		}
+		b.idx.InsertNoReplace(c, n)
+	}
+	return nil
+}
+
+// Discard closes this blockstore without finalizing its header and index.
+// After this call, the blockstore can no longer be used.
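+// Discard is the appropriate way to abandon a partially written CAR, for
+// example when ingestion fails partway and the file is going to be deleted.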
+//
+// Note that this call may block if any blockstore operations are currently in
+// progress, including an AllKeysChan that hasn't been fully consumed or cancelled.
+func (b *ReadWrite) Discard() {
+	// Same semantics as ReadOnly.Close, including allowing duplicate calls.
+	// The only difference is that our method is called Discard,
+	// to further clarify that we're not properly finalizing and writing a
+	// CARv2 file.
+	b.ronly.Close()
+}
+
+// Finalize finalizes this blockstore by writing the CARv2 header, along with a flattened index
+// for more efficient subsequent reads.
+// This is equivalent to calling FinalizeReadOnly and Close.
+// After this call, the blockstore can no longer be used.
+func (b *ReadWrite) Finalize() error {
+	b.ronly.mu.Lock()
+	defer b.ronly.mu.Unlock()
+
+	for _, err := range []error{b.finalizeReadOnlyWithoutMutex(), b.closeWithoutMutex()} {
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// FinalizeReadOnly finalizes this blockstore by writing the CARv2 header, along with a flattened
+// index for more efficient subsequent reads, but keeps the blockstore open for reading.
+// This call should be complemented later by a call to Close.
+func (b *ReadWrite) FinalizeReadOnly() error {
+	b.ronly.mu.Lock()
+	defer b.ronly.mu.Unlock()
+
+	return b.finalizeReadOnlyWithoutMutex()
+}
+
+func (b *ReadWrite) finalizeReadOnlyWithoutMutex() error {
+	if b.opts.WriteAsCarV1 {
+		// all blocks are already properly written to the CARv1 inner container and there's
+		// no additional finalization required at the end of the file for a complete v1
+		b.finalized = true
+		return nil
+	}
+
+	if b.ronly.closed {
+		// Allow duplicate Finalize calls, just like Close.
+		// Still error, just like ReadOnly.Close; it should be discarded.
+		return fmt.Errorf("called Finalize or FinalizeReadOnly on a closed blockstore")
+	}
+	if b.finalized {
+		return fmt.Errorf("called Finalize or FinalizeReadOnly on an already finalized blockstore")
+	}
+
+	b.finalized = true
+
+	return store.Finalize(b.f, b.header, b.idx, uint64(b.dataWriter.Position()), b.opts.StoreIdentityCIDs, b.opts.IndexCodec)
+}
+
+// Close closes the blockstore.
+// After this call, the blockstore can no longer be used.
+func (b *ReadWrite) Close() error {
+	b.ronly.mu.Lock()
+	defer b.ronly.mu.Unlock()
+
+	return b.closeWithoutMutex()
+}
+
+func (b *ReadWrite) closeWithoutMutex() error {
+	if !b.opts.WriteAsCarV1 && !b.finalized {
+		return fmt.Errorf("called Close without FinalizeReadOnly first")
+	}
+	if b.ronly.closed {
+		// Allow duplicate Close calls.
+		// Still error, just like ReadOnly.Close; it should be discarded.
+ return fmt.Errorf("called Close on a closed blockstore") + } + + if err := b.ronly.closeWithoutMutex(); err != nil { + return err + } + return nil +} + +func (b *ReadWrite) AllKeysChan(ctx context.Context) (<-chan cid.Cid, error) { + if ctx.Err() != nil { + return nil, ctx.Err() + } + + b.ronly.mu.Lock() + defer b.ronly.mu.Unlock() + + if b.ronly.closed { + return nil, errClosed + } + + out := make(chan cid.Cid) + + go func() { + defer close(out) + err := b.idx.ForEachCid(func(c cid.Cid, _ uint64) error { + if !b.opts.BlockstoreUseWholeCIDs { + c = cid.NewCidV1(cid.Raw, c.Hash()) + } + select { + case out <- c: + case <-ctx.Done(): + return ctx.Err() + } + return nil + }) + if err != nil { + maybeReportError(ctx, err) + } + }() + + return out, nil +} + +func (b *ReadWrite) Has(ctx context.Context, key cid.Cid) (bool, error) { + b.ronly.mu.Lock() + defer b.ronly.mu.Unlock() + + if b.ronly.closed { + return false, errClosed + } + + return store.Has( + b.idx, + key, + b.opts.MaxIndexCidSize, + b.opts.StoreIdentityCIDs, + b.opts.BlockstoreAllowDuplicatePuts, + b.opts.BlockstoreUseWholeCIDs, + ) +} + +func (b *ReadWrite) Get(ctx context.Context, key cid.Cid) (blocks.Block, error) { + return b.ronly.Get(ctx, key) +} + +func (b *ReadWrite) GetSize(ctx context.Context, key cid.Cid) (int, error) { + return b.ronly.GetSize(ctx, key) +} + +func (b *ReadWrite) DeleteBlock(_ context.Context, _ cid.Cid) error { + return fmt.Errorf("ReadWrite blockstore does not support deleting blocks") +} + +func (b *ReadWrite) HashOnRead(enable bool) { + b.ronly.HashOnRead(enable) +} + +func (b *ReadWrite) Roots() ([]cid.Cid, error) { + return b.ronly.Roots() +} diff --git a/ipld/car/v2/blockstore/readwrite_test.go b/ipld/car/v2/blockstore/readwrite_test.go new file mode 100644 index 0000000000..40731e5489 --- /dev/null +++ b/ipld/car/v2/blockstore/readwrite_test.go @@ -0,0 +1,1177 @@ +package blockstore_test + +import ( + "context" + "crypto/sha512" + "fmt" + "io" + "math/rand" + "os" + "path" + "path/filepath" + "sync" + "testing" + "time" + + "github.com/ipfs/go-cid" + cbor "github.com/ipfs/go-ipld-cbor" + format "github.com/ipfs/go-ipld-format" + blocks "github.com/ipfs/go-libipfs/blocks" + "github.com/ipfs/go-merkledag" + "github.com/multiformats/go-multicodec" + "github.com/multiformats/go-multihash" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + carv2 "github.com/ipld/go-car/v2" + "github.com/ipld/go-car/v2/blockstore" + "github.com/ipld/go-car/v2/index" + "github.com/ipld/go-car/v2/internal/carv1" +) + +var ( + rng = rand.New(rand.NewSource(1413)) + oneTestBlockWithCidV1 = merkledag.NewRawNode([]byte("fish")).Block + anotherTestBlockWithCidV0 = blocks.NewBlock([]byte("barreleye")) +) + +func TestReadWriteGetReturnsBlockstoreNotFoundWhenCidDoesNotExist(t *testing.T) { + path := filepath.Join(t.TempDir(), "readwrite-err-not-found.car") + subject, err := blockstore.OpenReadWrite(path, []cid.Cid{}) + t.Cleanup(func() { subject.Finalize() }) + require.NoError(t, err) + nonExistingKey := merkledag.NewRawNode([]byte("undadasea")).Block.Cid() + + // Assert blockstore API returns blockstore.ErrNotFound + gotBlock, err := subject.Get(context.TODO(), nonExistingKey) + require.IsType(t, format.ErrNotFound{}, err) + require.Nil(t, gotBlock) +} + +func TestBlockstore(t *testing.T) { + originalCARv1Path := "../testdata/sample-v1.car" + originalCARv1ComparePath := "../testdata/sample-v1-noidentity.car" + originalCARv1ComparePathStat, err := os.Stat(originalCARv1ComparePath) + 
require.NoError(t, err) + + variants := []struct { + name string + options []carv2.Option + expectedV1StartOffset int64 + }{ + // no options, expect a standard CARv2 with the noidentity inner CARv1 + {"noopt_carv2", []carv2.Option{}, int64(carv2.PragmaSize + carv2.HeaderSize)}, + // option to only write as a CARv1, expect the noidentity inner CARv1 + {"carv1", []carv2.Option{blockstore.WriteAsCarV1(true)}, int64(0)}, + } + + for _, variant := range variants { + t.Run(variant.name, func(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() + + f, err := os.Open(originalCARv1Path) + require.NoError(t, err) + t.Cleanup(func() { assert.NoError(t, f.Close()) }) + r, err := carv1.NewCarReader(f) + require.NoError(t, err) + + path := filepath.Join(t.TempDir(), fmt.Sprintf("readwrite_%s.car", variant.name)) + ingester, err := blockstore.OpenReadWrite(path, r.Header.Roots, variant.options...) + require.NoError(t, err) + t.Cleanup(func() { ingester.Finalize() }) + + cids := make([]cid.Cid, 0) + var idCidCount int + for { + b, err := r.Next() + if err == io.EOF { + break + } + require.NoError(t, err) + + err = ingester.Put(ctx, b) + require.NoError(t, err) + cids = append(cids, b.Cid()) + + // try reading a random one: + candidate := cids[rng.Intn(len(cids))] + if has, err := ingester.Has(ctx, candidate); !has || err != nil { + t.Fatalf("expected to find %s but didn't: %s", candidate, err) + } + + dmh, err := multihash.Decode(b.Cid().Hash()) + require.NoError(t, err) + if dmh.Code == multihash.IDENTITY { + idCidCount++ + } + } + + for _, c := range cids { + b, err := ingester.Get(ctx, c) + require.NoError(t, err) + if !b.Cid().Equals(c) { + t.Fatal("wrong item returned") + } + } + + err = ingester.Finalize() + require.NoError(t, err) + robs, err := blockstore.OpenReadOnly(path) + require.NoError(t, err) + t.Cleanup(func() { assert.NoError(t, robs.Close()) }) + + allKeysCh, err := robs.AllKeysChan(ctx) + require.NoError(t, err) + numKeysCh := 0 + for c := range allKeysCh { + b, err := robs.Get(ctx, c) + require.NoError(t, err) + if !b.Cid().Equals(c) { + t.Fatal("wrong item returned") + } + numKeysCh++ + } + expectedCidCount := len(cids) - idCidCount + require.Equal(t, expectedCidCount, numKeysCh, "AllKeysChan returned an unexpected amount of keys; expected %v but got %v", expectedCidCount, numKeysCh) + + for _, c := range cids { + b, err := robs.Get(ctx, c) + require.NoError(t, err) + if !b.Cid().Equals(c) { + t.Fatal("wrong item returned") + } + } + + wrote, err := os.Open(path) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, wrote.Close()) }) + _, err = wrote.Seek(variant.expectedV1StartOffset, io.SeekStart) + require.NoError(t, err) + hasher := sha512.New() + gotWritten, err := io.Copy(hasher, io.LimitReader(wrote, originalCARv1ComparePathStat.Size())) + require.NoError(t, err) + gotSum := hasher.Sum(nil) + + hasher.Reset() + originalCarV1, err := os.Open(originalCARv1ComparePath) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, originalCarV1.Close()) }) + wantWritten, err := io.Copy(hasher, originalCarV1) + require.NoError(t, err) + wantSum := hasher.Sum(nil) + + require.Equal(t, wantWritten, gotWritten) + require.Equal(t, wantSum, gotSum) + }) + } +} + +func TestBlockstorePutSameHashes(t *testing.T) { + tdir := t.TempDir() + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() + + // This blockstore allows duplicate puts, + // and identifies by multihash as per the default. 
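+	// With AllowDuplicatePuts, every Put appends a new data section even for a
+	// CID that has already been written.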
+	wbsAllowDups, err := blockstore.OpenReadWrite(
+		filepath.Join(tdir, "readwrite-allowdup.car"), nil,
+		blockstore.AllowDuplicatePuts(true),
+	)
+	require.NoError(t, err)
+	t.Cleanup(func() { wbsAllowDups.Finalize() })
+
+	// This blockstore deduplicates puts by CID.
+	wbsByCID, err := blockstore.OpenReadWrite(
+		filepath.Join(tdir, "readwrite-dedup-wholecid.car"), nil,
+		blockstore.UseWholeCIDs(true),
+	)
+	require.NoError(t, err)
+	t.Cleanup(func() { wbsByCID.Finalize() })
+
+	// This blockstore deduplicates puts by multihash.
+	wbsByHash, err := blockstore.OpenReadWrite(
+		filepath.Join(tdir, "readwrite-dedup-hash.car"), nil,
+	)
+	require.NoError(t, err)
+	t.Cleanup(func() { wbsByHash.Finalize() })
+
+	var blockList []blocks.Block
+
+	appendBlock := func(data []byte, version, codec uint64) {
+		c, err := cid.Prefix{
+			Version:  version,
+			Codec:    codec,
+			MhType:   multihash.SHA2_256,
+			MhLength: -1,
+		}.Sum(data)
+		require.NoError(t, err)
+
+		block, err := blocks.NewBlockWithCid(data, c)
+		require.NoError(t, err)
+
+		blockList = append(blockList, block)
+	}
+
+	// Two raw blocks, meaning we have two unique multihashes.
+	// However, we have multiple CIDs for each multihash.
+	// We also have two duplicate CIDs.
+	data1 := []byte("foo bar")
+	appendBlock(data1, 0, cid.DagProtobuf)
+	appendBlock(data1, 1, cid.DagProtobuf)
+	appendBlock(data1, 1, cid.DagCBOR)
+	appendBlock(data1, 1, cid.DagCBOR) // duplicate CID
+
+	data2 := []byte("foo bar baz")
+	appendBlock(data2, 0, cid.DagProtobuf)
+	appendBlock(data2, 1, cid.DagProtobuf)
+	appendBlock(data2, 1, cid.DagProtobuf) // duplicate CID
+	appendBlock(data2, 1, cid.DagCBOR)
+
+	countBlocks := func(bs *blockstore.ReadWrite) int {
+		ch, err := bs.AllKeysChan(context.Background())
+		require.NoError(t, err)
+
+		n := 0
+		for c := range ch {
+			if c.Prefix().Codec == cid.Raw {
+				if bs == wbsByCID {
+					t.Error("expected blockstore with UseWholeCIDs to not flatten on AllKeysChan")
+				}
+			} else {
+				if bs != wbsByCID {
+					t.Error("expected blockstore without UseWholeCIDs to flatten on AllKeysChan")
+				}
+			}
+			n++
+		}
+		return n
+	}
+
+	putBlockList := func(bs *blockstore.ReadWrite) {
+		for i, block := range blockList {
+			// Has should never error here.
+			// The first block should be missing.
+			// Others might not be, given the duplicate hashes.
+			has, err := bs.Has(ctx, block.Cid())
+			require.NoError(t, err)
+			if i == 0 {
+				require.False(t, has)
+			}
+
+			err = bs.Put(ctx, block)
+			require.NoError(t, err)
+
+			// Has, Get, and GetSize need to work right after a Put.
+			has, err = bs.Has(ctx, block.Cid())
+			require.NoError(t, err)
+			require.True(t, has)
+
+			got, err := bs.Get(ctx, block.Cid())
+			require.NoError(t, err)
+			require.Equal(t, block.Cid(), got.Cid())
+			require.Equal(t, block.RawData(), got.RawData())
+
+			size, err := bs.GetSize(ctx, block.Cid())
+			require.NoError(t, err)
+			require.Equal(t, len(block.RawData()), size)
+		}
+	}
+
+	putBlockList(wbsAllowDups)
+	require.Equal(t, len(blockList), countBlocks(wbsAllowDups))
+
+	err = wbsAllowDups.Finalize()
+	require.NoError(t, err)
+
+	// Put the same list of blocks to the blockstore that
+	// deduplicates by CID.
+	// We should end up with two fewer blocks,
+	// as two are entire CID duplicates.
+	putBlockList(wbsByCID)
+	require.Equal(t, len(blockList)-2, countBlocks(wbsByCID))
+
+	err = wbsByCID.Finalize()
+	require.NoError(t, err)
+
+	// Put the same list of blocks to the blockstore that
+	// deduplicates by multihash.
+ // We should end up with just two blocks, + // as the original set of blocks only has two distinct multihashes. + putBlockList(wbsByHash) + require.Equal(t, 2, countBlocks(wbsByHash)) + + err = wbsByHash.Finalize() + require.NoError(t, err) +} + +func TestBlockstoreConcurrentUse(t *testing.T) { + wbs, err := blockstore.OpenReadWrite(filepath.Join(t.TempDir(), "readwrite.car"), nil) + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() + + require.NoError(t, err) + t.Cleanup(func() { wbs.Finalize() }) + + var wg sync.WaitGroup + for i := 0; i < 100; i++ { + data := []byte(fmt.Sprintf("data-%d", i)) + + wg.Add(1) + go func() { + defer wg.Done() + + c, err := cid.Prefix{ + Version: 1, + Codec: cid.Raw, + MhType: multihash.SHA2_256, + MhLength: -1, + }.Sum(data) + require.NoError(t, err) + + block, err := blocks.NewBlockWithCid(data, c) + require.NoError(t, err) + + has, err := wbs.Has(ctx, block.Cid()) + require.NoError(t, err) + require.False(t, has) + + err = wbs.Put(ctx, block) + require.NoError(t, err) + + got, err := wbs.Get(ctx, block.Cid()) + require.NoError(t, err) + require.Equal(t, data, got.RawData()) + }() + } + wg.Wait() +} + +type bufferReaderAt []byte + +func (b bufferReaderAt) ReadAt(p []byte, off int64) (int, error) { + if off >= int64(len(b)) { + return 0, io.EOF + } + return copy(p, b[off:]), nil +} + +func TestBlockstoreNullPadding(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() + + paddedV1, err := os.ReadFile("../testdata/sample-v1-with-zero-len-section.car") + require.NoError(t, err) + + rbs, err := blockstore.NewReadOnly(bufferReaderAt(paddedV1), nil, + carv2.ZeroLengthSectionAsEOF(true)) + require.NoError(t, err) + + roots, err := rbs.Roots() + require.NoError(t, err) + + has, err := rbs.Has(ctx, roots[0]) + require.NoError(t, err) + require.True(t, has) + + allKeysCh, err := rbs.AllKeysChan(ctx) + require.NoError(t, err) + for c := range allKeysCh { + b, err := rbs.Get(ctx, c) + require.NoError(t, err) + if !b.Cid().Equals(c) { + t.Fatal("wrong item returned") + } + } +} + +func TestBlockstoreResumption(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() + + v1f, err := os.Open("../testdata/sample-v1.car") + require.NoError(t, err) + t.Cleanup(func() { assert.NoError(t, v1f.Close()) }) + r, err := carv1.NewCarReader(v1f) + require.NoError(t, err) + + path := filepath.Join(t.TempDir(), "readwrite-resume.car") + // Create an incomplete CARv2 file with no blocks put. + subject, err := blockstore.OpenReadWrite(path, r.Header.Roots, + blockstore.UseWholeCIDs(true)) + require.NoError(t, err) + + // For each block resume on the same file, putting blocks one at a time. + var wantBlockCountSoFar, idCidCount int + wantBlocks := make(map[cid.Cid]blocks.Block) + for { + b, err := r.Next() + if err == io.EOF { + break + } + require.NoError(t, err) + wantBlockCountSoFar++ + wantBlocks[b.Cid()] = b + + dmh, err := multihash.Decode(b.Cid().Hash()) + require.NoError(t, err) + if dmh.Code == multihash.IDENTITY { + idCidCount++ + } + + // 30% chance of subject failing; more concretely: re-instantiating blockstore with the same + // file without calling Finalize. The higher this percentage the slower the test runs + // considering the number of blocks in the original CARv1 test payload. + resume := rng.Float32() <= 0.3 + // If testing resume case, then flip a coin to decide whether to finalize before blockstore + // re-instantiation or not. 
Note, both cases should work for resumption since we do not
+		// limit resumption to unfinalized files.
+		finalizeBeforeResumption := rng.Float32() <= 0.5
+		if resume {
+			if finalizeBeforeResumption {
+				require.NoError(t, subject.Finalize())
+			} else {
+				// Close off the open file and re-instantiate a new subject with resumption enabled.
+				// Note, we don't have to close the file for resumption to work.
+				// We do this to avoid a resource leak during testing.
+				subject.Discard()
+			}
+			subject, err = blockstore.OpenReadWrite(path, r.Header.Roots,
+				blockstore.UseWholeCIDs(true))
+			require.NoError(t, err)
+		}
+		require.NoError(t, subject.Put(ctx, b))
+
+		// With 10% chance, test read operations on a resumed read-write blockstore.
+		// We don't test on every put to reduce test runtime.
+		testRead := rng.Float32() <= 0.1
+		if testRead {
+			// Assert read operations on the read-write blockstore are as expected when resumed from an
+			// existing file.
+			var gotBlockCountSoFar int
+			ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+			t.Cleanup(cancel)
+			keysChan, err := subject.AllKeysChan(ctx)
+			require.NoError(t, err)
+			for k := range keysChan {
+				has, err := subject.Has(ctx, k)
+				require.NoError(t, err)
+				require.True(t, has)
+				gotBlock, err := subject.Get(ctx, k)
+				require.NoError(t, err)
+				require.Equal(t, wantBlocks[k], gotBlock)
+				gotBlockCountSoFar++
+			}
+			// Assert the number of blocks in the file is as expected, as calculated via AllKeysChan.
+			require.Equal(t, wantBlockCountSoFar-idCidCount, gotBlockCountSoFar)
+		}
+	}
+	subject.Discard()
+
+	// Finalize the blockstore to complete the partially written CARv2 file.
+	subject, err = blockstore.OpenReadWrite(path, r.Header.Roots,
+		blockstore.UseWholeCIDs(true))
+	require.NoError(t, err)
+	require.NoError(t, subject.Finalize())
+
+	// Assert the resumed-from file is a valid CARv2 with an index.
+	v2f, err := os.Open(path)
+	require.NoError(t, err)
+	t.Cleanup(func() { assert.NoError(t, v2f.Close()) })
+	v2r, err := carv2.NewReader(v2f)
+	require.NoError(t, err)
+	require.True(t, v2r.Header.HasIndex())
+
+	// Assert the CARv1 payload in the file matches the original CARv1 payload.
+	_, err = v1f.Seek(0, io.SeekStart)
+	require.NoError(t, err)
+	wantPayloadReader, err := carv1.NewCarReader(v1f)
+	require.NoError(t, err)
+
+	dr, err := v2r.DataReader()
+	require.NoError(t, err)
+	gotPayloadReader, err := carv1.NewCarReader(dr)
+	require.NoError(t, err)
+
+	require.Equal(t, wantPayloadReader.Header, gotPayloadReader.Header)
+	for {
+		wantNextBlock, wantErr := wantPayloadReader.Next()
+		if wantErr == io.EOF {
+			gotNextBlock, gotErr := gotPayloadReader.Next()
+			require.Equal(t, wantErr, gotErr)
+			require.Nil(t, gotNextBlock)
+			break
+		}
+		require.NoError(t, wantErr)
+
+		dmh, err := multihash.Decode(wantNextBlock.Cid().Hash())
+		require.NoError(t, err)
+		if dmh.Code == multihash.IDENTITY {
+			continue
+		}
+
+		gotNextBlock, gotErr := gotPayloadReader.Next()
+		require.NoError(t, gotErr)
+		require.Equal(t, wantNextBlock, gotNextBlock)
+	}
+
+	// Assert the index in the resumed-from file is identical to an index generated from the data payload portion of the CARv2 file.
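+	// (carv2.GenerateIndex below re-scans the inner CARv1 payload from scratch,
+	// so equality with the stored index shows that resumption re-indexed every
+	// existing section.)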
+ _, err = v1f.Seek(0, io.SeekStart) + require.NoError(t, err) + ir, err := v2r.IndexReader() + require.NoError(t, err) + gotIdx, err := index.ReadFrom(ir) + require.NoError(t, err) + dr, err = v2r.DataReader() + require.NoError(t, err) + wantIdx, err := carv2.GenerateIndex(dr) + require.NoError(t, err) + require.Equal(t, wantIdx, gotIdx) +} + +func TestBlockstoreResumptionIsSupportedOnFinalizedFile(t *testing.T) { + path := filepath.Join(t.TempDir(), "readwrite-resume-finalized.car") + // Create an incomplete CARv2 file with no blocks put. + subject, err := blockstore.OpenReadWrite(path, []cid.Cid{}) + require.NoError(t, err) + require.NoError(t, subject.Finalize()) + subject, err = blockstore.OpenReadWrite(path, []cid.Cid{}) + require.NoError(t, err) + t.Cleanup(func() { subject.Finalize() }) +} + +func TestReadWritePanicsOnlyWhenFinalized(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() + + oneTestBlockCid := oneTestBlockWithCidV1.Cid() + anotherTestBlockCid := anotherTestBlockWithCidV0.Cid() + wantRoots := []cid.Cid{oneTestBlockCid, anotherTestBlockCid} + path := filepath.Join(t.TempDir(), "readwrite-finalized-panic.car") + + subject, err := blockstore.OpenReadWrite(path, wantRoots) + require.NoError(t, err) + + require.NoError(t, subject.Put(ctx, oneTestBlockWithCidV1)) + require.NoError(t, subject.Put(ctx, anotherTestBlockWithCidV0)) + + gotBlock, err := subject.Get(ctx, oneTestBlockCid) + require.NoError(t, err) + require.Equal(t, oneTestBlockWithCidV1, gotBlock) + + gotSize, err := subject.GetSize(ctx, oneTestBlockCid) + require.NoError(t, err) + require.Equal(t, len(oneTestBlockWithCidV1.RawData()), gotSize) + + gotRoots, err := subject.Roots() + require.NoError(t, err) + require.Equal(t, wantRoots, gotRoots) + + has, err := subject.Has(ctx, oneTestBlockCid) + require.NoError(t, err) + require.True(t, has) + + subject.HashOnRead(true) + // Delete should always error regardless of finalize + require.Error(t, subject.DeleteBlock(ctx, oneTestBlockCid)) + + require.NoError(t, subject.Finalize()) + require.Error(t, subject.Finalize()) + + _, err = subject.Get(ctx, oneTestBlockCid) + require.Error(t, err) + _, err = subject.GetSize(ctx, anotherTestBlockCid) + require.Error(t, err) + _, err = subject.Has(ctx, anotherTestBlockCid) + require.Error(t, err) + + require.Error(t, subject.Put(ctx, oneTestBlockWithCidV1)) + require.Error(t, subject.PutMany(ctx, []blocks.Block{anotherTestBlockWithCidV0})) + _, err = subject.AllKeysChan(context.Background()) + require.Error(t, err) + require.Error(t, subject.DeleteBlock(ctx, oneTestBlockCid)) +} + +func TestReadWriteWithPaddingWorksAsExpected(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() + + oneTestBlockCid := oneTestBlockWithCidV1.Cid() + anotherTestBlockCid := anotherTestBlockWithCidV0.Cid() + WantRoots := []cid.Cid{oneTestBlockCid, anotherTestBlockCid} + path := filepath.Join(t.TempDir(), "readwrite-with-padding.car") + + wantCarV1Padding := uint64(1413) + wantIndexPadding := uint64(1314) + subject, err := blockstore.OpenReadWrite( + path, + WantRoots, + carv2.UseDataPadding(wantCarV1Padding), + carv2.UseIndexPadding(wantIndexPadding)) + require.NoError(t, err) + require.NoError(t, subject.Put(ctx, oneTestBlockWithCidV1)) + require.NoError(t, subject.Put(ctx, anotherTestBlockWithCidV0)) + require.NoError(t, subject.Finalize()) + + // Assert CARv2 header contains right offsets. 
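+	// (Expected layout: pragma, CARv2 header, data padding, CARv1 payload,
+	// index padding, index; the two offsets computed below follow directly
+	// from that layout.)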
+ gotCarV2, err := carv2.OpenReader(path) + t.Cleanup(func() { gotCarV2.Close() }) + require.NoError(t, err) + wantCarV1Offset := carv2.PragmaSize + carv2.HeaderSize + wantCarV1Padding + wantIndexOffset := wantCarV1Offset + gotCarV2.Header.DataSize + wantIndexPadding + require.Equal(t, wantCarV1Offset, gotCarV2.Header.DataOffset) + require.Equal(t, wantIndexOffset, gotCarV2.Header.IndexOffset) + require.NoError(t, gotCarV2.Close()) + + f, err := os.Open(path) + require.NoError(t, err) + t.Cleanup(func() { f.Close() }) + + // Assert reading CARv1 directly at offset and size is as expected. + gotCarV1, err := carv1.NewCarReader(io.NewSectionReader(f, int64(wantCarV1Offset), int64(gotCarV2.Header.DataSize))) + require.NoError(t, err) + require.Equal(t, WantRoots, gotCarV1.Header.Roots) + gotOneBlock, err := gotCarV1.Next() + require.NoError(t, err) + require.Equal(t, oneTestBlockWithCidV1, gotOneBlock) + gotAnotherBlock, err := gotCarV1.Next() + require.NoError(t, err) + require.Equal(t, anotherTestBlockWithCidV0, gotAnotherBlock) + _, err = gotCarV1.Next() + require.Equal(t, io.EOF, err) + + // Assert reading index directly from file is parsable and has expected CIDs. + stat, err := f.Stat() + require.NoError(t, err) + indexSize := stat.Size() - int64(wantIndexOffset) + gotIdx, err := index.ReadFrom(io.NewSectionReader(f, int64(wantIndexOffset), indexSize)) + require.NoError(t, err) + _, err = index.GetFirst(gotIdx, oneTestBlockCid) + require.NoError(t, err) + _, err = index.GetFirst(gotIdx, anotherTestBlockCid) + require.NoError(t, err) +} + +func TestReadWriteResumptionFromNonV2FileIsError(t *testing.T) { + tmpPath := requireTmpCopy(t, "../testdata/sample-rootless-v42.car") + subject, err := blockstore.OpenReadWrite(tmpPath, []cid.Cid{}) + require.EqualError(t, err, "cannot resume on CAR file with version 42") + require.Nil(t, subject) +} + +func TestReadWriteResumptionMismatchingRootsIsError(t *testing.T) { + tmpPath := requireTmpCopy(t, "../testdata/sample-wrapped-v2.car") + + origContent, err := os.ReadFile(tmpPath) + require.NoError(t, err) + + badRoot, err := cid.NewPrefixV1(cid.Raw, multihash.SHA2_256).Sum([]byte("bad root")) + require.NoError(t, err) + + subject, err := blockstore.OpenReadWrite(tmpPath, []cid.Cid{badRoot}) + require.EqualError(t, err, "cannot resume on file with mismatching data header") + require.Nil(t, subject) + + newContent, err := os.ReadFile(tmpPath) + require.NoError(t, err) + + // Expect the bad file to be left untouched; check the size first. + // If the sizes mismatch, printing a huge diff would not help us. 
+ require.Equal(t, len(origContent), len(newContent)) + require.Equal(t, origContent, newContent) +} + +func requireTmpCopy(t *testing.T, src string) string { + srcF, err := os.Open(src) + require.NoError(t, err) + defer func() { require.NoError(t, srcF.Close()) }() + stats, err := srcF.Stat() + require.NoError(t, err) + + dst := filepath.Join(t.TempDir(), stats.Name()) + dstF, err := os.Create(dst) + require.NoError(t, err) + defer func() { require.NoError(t, dstF.Close()) }() + + _, err = io.Copy(dstF, srcF) + require.NoError(t, err) + return dst +} + +func TestReadWriteResumptionFromFileWithDifferentCarV1PaddingIsError(t *testing.T) { + oneTestBlockCid := oneTestBlockWithCidV1.Cid() + WantRoots := []cid.Cid{oneTestBlockCid} + path := filepath.Join(t.TempDir(), "readwrite-resume-with-padding.car") + + subject, err := blockstore.OpenReadWrite( + path, + WantRoots, + carv2.UseDataPadding(1413)) + require.NoError(t, err) + require.NoError(t, subject.Put(context.TODO(), oneTestBlockWithCidV1)) + require.NoError(t, subject.Finalize()) + + resumingSubject, err := blockstore.OpenReadWrite( + path, + WantRoots, + carv2.UseDataPadding(1314)) + require.EqualError(t, err, "cannot resume from file with mismatched CARv1 offset; "+ + "`WithDataPadding` option must match the padding on file. "+ + "Expected padding value of 1413 but got 1314") + require.Nil(t, resumingSubject) +} + +func TestReadWriteErrorAfterClose(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() + + root := blocks.NewBlock([]byte("foo")) + for _, closeMethod := range []func(*blockstore.ReadWrite){ + (*blockstore.ReadWrite).Discard, + func(bs *blockstore.ReadWrite) { bs.Finalize() }, + } { + path := filepath.Join(t.TempDir(), "readwrite.car") + bs, err := blockstore.OpenReadWrite(path, []cid.Cid{root.Cid()}) + require.NoError(t, err) + + err = bs.Put(ctx, root) + require.NoError(t, err) + + roots, err := bs.Roots() + require.NoError(t, err) + _, err = bs.Has(ctx, roots[0]) + require.NoError(t, err) + _, err = bs.Get(ctx, roots[0]) + require.NoError(t, err) + _, err = bs.GetSize(ctx, roots[0]) + require.NoError(t, err) + + ctx, cancel := context.WithCancel(context.Background()) + _, err = bs.AllKeysChan(ctx) + require.NoError(t, err) + cancel() // to stop the AllKeysChan goroutine + + closeMethod(bs) + + _, err = bs.Roots() + require.Error(t, err) + _, err = bs.Has(ctx, roots[0]) + require.Error(t, err) + _, err = bs.Get(ctx, roots[0]) + require.Error(t, err) + _, err = bs.GetSize(ctx, roots[0]) + require.Error(t, err) + _, err = bs.AllKeysChan(ctx) + require.Error(t, err) + + err = bs.Put(ctx, root) + require.Error(t, err) + + // TODO: test that closing blocks if an AllKeysChan operation is + // in progress. 
+ } +} + +func TestOpenReadWrite_WritesIdentityCIDsWhenOptionIsEnabled(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() + + path := filepath.Join(t.TempDir(), "readwrite-with-id-enabled.car") + subject, err := blockstore.OpenReadWrite(path, []cid.Cid{}, carv2.StoreIdentityCIDs(true)) + require.NoError(t, err) + + data := []byte("fish") + idmh, err := multihash.Sum(data, multihash.IDENTITY, -1) + require.NoError(t, err) + idCid := cid.NewCidV1(uint64(multicodec.Raw), idmh) + + idBlock, err := blocks.NewBlockWithCid(data, idCid) + require.NoError(t, err) + err = subject.Put(ctx, idBlock) + require.NoError(t, err) + + has, err := subject.Has(ctx, idCid) + require.NoError(t, err) + require.True(t, has) + + gotBlock, err := subject.Get(ctx, idCid) + require.NoError(t, err) + require.Equal(t, idBlock, gotBlock) + + keysChan, err := subject.AllKeysChan(context.Background()) + require.NoError(t, err) + var i int + for c := range keysChan { + i++ + require.Equal(t, idCid, c) + } + require.Equal(t, 1, i) + + err = subject.Finalize() + require.NoError(t, err) + + // Assert resulting CAR file indeed has the IDENTITY block. + f, err := os.Open(path) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, f.Close()) }) + + reader, err := carv2.NewBlockReader(f) + require.NoError(t, err) + + gotBlock, err = reader.Next() + require.NoError(t, err) + require.Equal(t, idBlock, gotBlock) + + next, err := reader.Next() + require.Equal(t, io.EOF, err) + require.Nil(t, next) + + // Assert the id is indexed. + r, err := carv2.OpenReader(path) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, r.Close()) }) + require.True(t, r.Header.HasIndex()) + + ir, err := r.IndexReader() + require.NoError(t, err) + require.NotNil(t, ir) + + gotIdx, err := index.ReadFrom(ir) + require.NoError(t, err) + + // Determine expected offset as the length of header plus one + dr, err := r.DataReader() + require.NoError(t, err) + header, err := carv1.ReadHeader(dr, carv1.DefaultMaxAllowedHeaderSize) + require.NoError(t, err) + object, err := cbor.DumpObject(header) + require.NoError(t, err) + expectedOffset := len(object) + 1 + + // Assert index is iterable and has exactly one record with expected multihash and offset. 
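+	// (The type switch below assumes the index implements index.IterableIndex,
+	// which the default multihash-sorted index codec does; any other codec is
+	// reported as a failure.)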
+	switch idx := gotIdx.(type) {
+	case index.IterableIndex:
+		var i int
+		err := idx.ForEach(func(mh multihash.Multihash, offset uint64) error {
+			i++
+			require.Equal(t, idmh, mh)
+			require.Equal(t, uint64(expectedOffset), offset)
+			return nil
+		})
+		require.NoError(t, err)
+		require.Equal(t, 1, i)
+	default:
+		require.Failf(t, "unexpected index type", "wanted %v but got %v", multicodec.CarMultihashIndexSorted, idx.Codec())
+	}
+}
+
+func TestOpenReadWrite_ErrorsWhenWritingTooLargeOfACid(t *testing.T) {
+	maxAllowedCidSize := uint64(2)
+	path := filepath.Join(t.TempDir(), "readwrite-with-id-enabled-too-large.car")
+	subject, err := blockstore.OpenReadWrite(path, []cid.Cid{}, carv2.MaxIndexCidSize(maxAllowedCidSize))
+	require.NoError(t, err)
+	t.Cleanup(subject.Discard)
+
+	data := []byte("monsterlobster")
+	mh, err := multihash.Sum(data, multihash.SHA2_256, -1)
+	require.NoError(t, err)
+	bigCid := cid.NewCidV1(uint64(multicodec.Raw), mh)
+	bigCidLen := uint64(bigCid.ByteLen())
+	require.True(t, bigCidLen > maxAllowedCidSize)
+
+	bigBlock, err := blocks.NewBlockWithCid(data, bigCid)
+	require.NoError(t, err)
+	err = subject.Put(context.TODO(), bigBlock)
+	require.Equal(t, &carv2.ErrCidTooLarge{MaxSize: maxAllowedCidSize, CurrentSize: bigCidLen}, err)
+}
+
+func TestReadWrite_ReWritingCARv1WithIdentityCidIsIdenticalToOriginalWithOptionsEnabled(t *testing.T) {
+	originalCARv1Path := "../testdata/sample-v1.car"
+	originalCarV1, err := os.Open(originalCARv1Path)
+	require.NoError(t, err)
+	t.Cleanup(func() { require.NoError(t, originalCarV1.Close()) })
+
+	r, err := carv2.NewBlockReader(originalCarV1)
+	require.NoError(t, err)
+
+	path := filepath.Join(t.TempDir(), "readwrite-from-carv1-with-id-enabled.car")
+	subject, err := blockstore.OpenReadWrite(path, r.Roots, carv2.StoreIdentityCIDs(true))
+	require.NoError(t, err)
+	var idCidCount int
+	for {
+		next, err := r.Next()
+		if err == io.EOF {
+			break
+		}
+		require.NoError(t, err)
+		if next.Cid().Prefix().MhType == multihash.IDENTITY {
+			idCidCount++
+		}
+		err = subject.Put(context.TODO(), next)
+		require.NoError(t, err)
+	}
+	require.NotZero(t, idCidCount)
+	err = subject.Finalize()
+	require.NoError(t, err)
+
+	v2r, err := carv2.OpenReader(path)
+	require.NoError(t, err)
+	t.Cleanup(func() { require.NoError(t, v2r.Close()) })
+
+	// Assert the characteristics bit is set.
+	require.True(t, v2r.Header.Characteristics.IsFullyIndexed())
+
+	// Assert the original CARv1 and the generated inner CARv1 payload have the same SHA512 hash.
+	// Note, we hash instead of comparing bytes to avoid excessive memory usage when the sample CARv1 is large.
+ + hasher := sha512.New() + dr, err := v2r.DataReader() + require.NoError(t, err) + gotWritten, err := io.Copy(hasher, dr) + require.NoError(t, err) + gotSum := hasher.Sum(nil) + + hasher.Reset() + _, err = originalCarV1.Seek(0, io.SeekStart) + require.NoError(t, err) + wantWritten, err := io.Copy(hasher, originalCarV1) + require.NoError(t, err) + wantSum := hasher.Sum(nil) + + require.Equal(t, wantWritten, gotWritten) + require.Equal(t, wantSum, gotSum) +} + +func TestReadWriteOpenFile(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() + + dir := t.TempDir() // auto cleanup + f, err := os.CreateTemp(dir, "") + require.NoError(t, err) + + root := blocks.NewBlock([]byte("foo")) + + bs, err := blockstore.OpenReadWriteFile(f, []cid.Cid{root.Cid()}) + require.NoError(t, err) + + err = bs.Put(ctx, root) + require.NoError(t, err) + + roots, err := bs.Roots() + require.NoError(t, err) + _, err = bs.Has(ctx, roots[0]) + require.NoError(t, err) + _, err = bs.Get(ctx, roots[0]) + require.NoError(t, err) + _, err = bs.GetSize(ctx, roots[0]) + require.NoError(t, err) + + err = bs.Finalize() + require.NoError(t, err) + + _, err = f.Seek(0, 0) + require.NoError(t, err) // file should not be closed, let the caller do it + + err = f.Close() + require.NoError(t, err) +} + +func TestBlockstore_IdentityCidWithEmptyDataIsIndexed(t *testing.T) { + p := path.Join(t.TempDir(), "car-id-cid-empty.carv2") + var noData []byte + + mh, err := multihash.Sum(noData, multihash.IDENTITY, -1) + require.NoError(t, err) + w, err := blockstore.OpenReadWrite(p, nil, carv2.StoreIdentityCIDs(true)) + require.NoError(t, err) + + blk, err := blocks.NewBlockWithCid(noData, cid.NewCidV1(cid.Raw, mh)) + require.NoError(t, err) + + err = w.Put(context.TODO(), blk) + require.NoError(t, err) + require.NoError(t, w.Finalize()) + + r, err := carv2.OpenReader(p) + require.NoError(t, err) + defer func() { require.NoError(t, r.Close()) }() + + dr, err := r.DataReader() + require.NoError(t, err) + header, err := carv1.ReadHeader(dr, carv1.DefaultMaxAllowedHeaderSize) + require.NoError(t, err) + wantOffset, err := carv1.HeaderSize(header) + require.NoError(t, err) + + ir, err := r.IndexReader() + require.NoError(t, err) + idx, err := index.ReadFrom(ir) + require.NoError(t, err) + + itidx, ok := idx.(index.IterableIndex) + require.True(t, ok) + var count int + err = itidx.ForEach(func(m multihash.Multihash, u uint64) error { + dm, err := multihash.Decode(m) + require.NoError(t, err) + require.Equal(t, multicodec.Identity, multicodec.Code(dm.Code)) + require.Equal(t, 0, dm.Length) + require.Empty(t, dm.Digest) + require.Equal(t, wantOffset, u) + count++ + return nil + }) + require.NoError(t, err) + require.Equal(t, 1, count) +} + +func TestBlockstoreFinalizeReadOnly(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() + + root := blocks.NewBlock([]byte("foo")) + + p := filepath.Join(t.TempDir(), "readwrite.car") + bs, err := blockstore.OpenReadWrite(p, []cid.Cid{root.Cid()}) + require.NoError(t, err) + + err = bs.Put(ctx, root) + require.NoError(t, err) + + roots, err := bs.Roots() + require.NoError(t, err) + _, err = bs.Has(ctx, roots[0]) + require.NoError(t, err) + _, err = bs.Get(ctx, roots[0]) + require.NoError(t, err) + _, err = bs.GetSize(ctx, roots[0]) + require.NoError(t, err) + _, err = bs.AllKeysChan(ctx) + require.NoError(t, err) + + // soft finalize, we can still read, but not write + err = bs.FinalizeReadOnly() + 
require.NoError(t, err) + + _, err = bs.Roots() + require.NoError(t, err) + _, err = bs.Has(ctx, roots[0]) + require.NoError(t, err) + _, err = bs.Get(ctx, roots[0]) + require.NoError(t, err) + _, err = bs.GetSize(ctx, roots[0]) + require.NoError(t, err) + _, err = bs.AllKeysChan(ctx) + require.NoError(t, err) + + err = bs.Put(ctx, root) + require.Error(t, err) + + // final close, nothing works anymore + err = bs.Close() + require.NoError(t, err) + + _, err = bs.Roots() + require.Error(t, err) + _, err = bs.Has(ctx, roots[0]) + require.Error(t, err) + _, err = bs.Get(ctx, roots[0]) + require.Error(t, err) + _, err = bs.GetSize(ctx, roots[0]) + require.Error(t, err) + _, err = bs.AllKeysChan(ctx) + require.Error(t, err) + + err = bs.Put(ctx, root) + require.Error(t, err) +} + +func TestWholeCID(t *testing.T) { + for _, whole := range []bool{true, false} { + whole := whole + t.Run(fmt.Sprintf("whole=%t", whole), func(t *testing.T) { + t.Parallel() + ctx := context.Background() + path := filepath.Join(t.TempDir(), fmt.Sprintf("writable_%t.car", whole)) + rw, err := blockstore.OpenReadWrite(path, []cid.Cid{}, carv2.UseWholeCIDs(whole)) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, rw.Finalize()) }) + + require.NoError(t, rw.Put(ctx, oneTestBlockWithCidV1)) + has, err := rw.Has(ctx, oneTestBlockWithCidV1.Cid()) + require.NoError(t, err) + require.True(t, has) + + pref := oneTestBlockWithCidV1.Cid().Prefix() + pref.Codec = cid.DagCBOR + pref.Version = 1 + cpb1, err := pref.Sum(oneTestBlockWithCidV1.RawData()) + require.NoError(t, err) + + has, err = rw.Has(ctx, cpb1) + require.NoError(t, err) + require.Equal(t, has, !whole) + + require.NoError(t, rw.Put(ctx, anotherTestBlockWithCidV0)) + has, err = rw.Has(ctx, anotherTestBlockWithCidV0.Cid()) + require.NoError(t, err) + require.True(t, has) + has, err = rw.Has(ctx, cpb1) + require.NoError(t, err) + require.Equal(t, has, !whole) + + pref = anotherTestBlockWithCidV0.Cid().Prefix() + pref.Codec = cid.DagJSON + pref.Version = 1 + cpb2, err := pref.Sum(anotherTestBlockWithCidV0.RawData()) + require.NoError(t, err) + + has, err = rw.Has(ctx, cpb2) + require.NoError(t, err) + require.Equal(t, has, !whole) + has, err = rw.Has(ctx, cpb1) + require.NoError(t, err) + require.Equal(t, has, !whole) + }) + } +} + +func TestReadWriteIndex(t *testing.T) { + tmpPath := requireTmpCopy(t, "../testdata/sample-wrapped-v2.car") + + root := cid.MustParse("bafy2bzaced4ueelaegfs5fqu4tzsh6ywbbpfk3cxppupmxfdhbpbhzawfw5oy") + subject, err := blockstore.OpenReadWrite(tmpPath, []cid.Cid{root}) + require.NoError(t, err) + + defer func() { + err = subject.Finalize() + require.NoError(t, err) + }() + + var wantCids []cid.Cid + var wantMh []multihash.Multihash + ch, err := subject.AllKeysChan(context.Background()) + require.NoError(t, err) + for c := range ch { + wantCids = append(wantCids, c) + wantMh = append(wantMh, c.Hash()) + } + + idx := subject.Index() + + for _, c := range wantCids { + _, err = index.GetFirst(idx, c) + require.NoError(t, err) + } + + if idx, ok := idx.(index.IterableIndex); ok { + var got []multihash.Multihash + err = idx.ForEach(func(m multihash.Multihash, u uint64) error { + got = append(got, m) + return nil + }) + require.NoError(t, err) + require.ElementsMatch(t, wantMh, got) + } +} diff --git a/ipld/car/v2/car.go b/ipld/car/v2/car.go new file mode 100644 index 0000000000..571eb1140b --- /dev/null +++ b/ipld/car/v2/car.go @@ -0,0 +1,193 @@ +package car + +import ( + "encoding/binary" + "fmt" + "io" +) + +const ( + // PragmaSize is 
the size of the CARv2 pragma in bytes.
+	PragmaSize = 11
+	// HeaderSize is the fixed size of the CARv2 header in bytes.
+	HeaderSize = 40
+	// CharacteristicsSize is the fixed size of the Characteristics bitfield within the CARv2 header, in bytes.
+	CharacteristicsSize = 16
+)
+
+// Pragma is the pragma of a CARv2, containing the version number.
+// This is a valid CARv1 header, with a version number of 2 and no root CIDs.
+var Pragma = []byte{
+	0x0a,                                     // uint(10)
+	0xa1,                                     // map(1)
+	0x67,                                     // string(7)
+	0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, // "version"
+	0x02, // uint(2)
+}
+
+type (
+	// Header represents the CARv2 header/pragma.
+	Header struct {
+		// 128-bit characteristics of this CARv2 file, such as order, deduplication, etc. Reserved for future use.
+		Characteristics Characteristics
+		// The byte-offset from the beginning of the CARv2 to the first byte of the CARv1 data payload.
+		DataOffset uint64
+		// The byte-length of the CARv1 data payload.
+		DataSize uint64
+		// The byte-offset from the beginning of the CARv2 to the first byte of the index payload.
+		// This value may be 0 to indicate the absence of index data.
+		IndexOffset uint64
+	}
+	// Characteristics is a bitfield placeholder for capturing the characteristics of a CARv2 such as order and determinism.
+	Characteristics struct {
+		Hi uint64
+		Lo uint64
+	}
+)
+
+// fullyIndexedCharPos is the position of the Characteristics.Hi bit that specifies whether the index is a catalog of all CIDs or not.
+const fullyIndexedCharPos = 7 // left-most bit of the first serialized byte
+
+// WriteTo writes this Characteristics bitfield to the given w.
+func (c Characteristics) WriteTo(w io.Writer) (n int64, err error) {
+	buf := make([]byte, 16)
+	binary.LittleEndian.PutUint64(buf[:8], c.Hi)
+	binary.LittleEndian.PutUint64(buf[8:], c.Lo)
+	written, err := w.Write(buf)
+	return int64(written), err
+}
+
+// ReadFrom populates this Characteristics bitfield from the given r.
+func (c *Characteristics) ReadFrom(r io.Reader) (int64, error) {
+	buf := make([]byte, CharacteristicsSize)
+	read, err := io.ReadFull(r, buf)
+	n := int64(read)
+	if err != nil {
+		return n, err
+	}
+	c.Hi = binary.LittleEndian.Uint64(buf[:8])
+	c.Lo = binary.LittleEndian.Uint64(buf[8:])
+	return n, nil
+}
+
+// IsFullyIndexed specifies whether the index of this CARv2 represents a catalog of all CID segments.
+// See StoreIdentityCIDs.
+func (c *Characteristics) IsFullyIndexed() bool {
+	return isBitSet(c.Hi, fullyIndexedCharPos)
+}
+
+// SetFullyIndexed sets whether this CARv2 represents a catalog of all CID segments.
+func (c *Characteristics) SetFullyIndexed(b bool) {
+	if b {
+		c.Hi = setBit(c.Hi, fullyIndexedCharPos)
+	} else {
+		c.Hi = unsetBit(c.Hi, fullyIndexedCharPos)
+	}
+}
+
+func setBit(n uint64, pos uint) uint64 {
+	n |= 1 << pos
+	return n
+}
+
+func unsetBit(n uint64, pos uint) uint64 {
+	mask := uint64(^(1 << pos))
+	n &= mask
+	return n
+}
+
+func isBitSet(n uint64, pos uint) bool {
+	bit := n & (1 << pos)
+	return bit > 0
+}
+
+// NewHeader instantiates a new CARv2 header, given the data size.
+func NewHeader(dataSize uint64) Header {
+	header := Header{
+		DataSize: dataSize,
+	}
+	header.DataOffset = PragmaSize + HeaderSize
+	header.IndexOffset = header.DataOffset + dataSize
+	return header
+}
+
+// WithIndexPadding sets the index offset from the beginning of the file for this header and returns
+// the header for convenient chained calls.
+// The index offset is calculated as the sum of PragmaSize, HeaderSize,
+// Header.DataSize, and the given padding.
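+//
+// For illustration, a caller that wants 64 bytes of padding between the data
+// payload and the index might chain the helpers like this (a sketch; the
+// padding size is arbitrary):
+//
+//	h := NewHeader(dataSize).WithIndexPadding(64)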
+func (h Header) WithIndexPadding(padding uint64) Header {
+	h.IndexOffset = h.IndexOffset + padding
+	return h
+}
+
+// WithDataPadding sets the data payload byte-offset from the beginning of the file for this header
+// and returns the header for convenient chained calls.
+// The data offset is calculated as the sum of PragmaSize, HeaderSize and the given padding.
+// The call to this function also shifts the Header.IndexOffset forward by the given padding.
+func (h Header) WithDataPadding(padding uint64) Header {
+	h.DataOffset = PragmaSize + HeaderSize + padding
+	h.IndexOffset = h.IndexOffset + padding
+	return h
+}
+
+// WithDataSize sets the data payload size for this header and shifts Header.IndexOffset forward
+// by the given size, returning the header for convenient chained calls.
+func (h Header) WithDataSize(size uint64) Header {
+	h.DataSize = size
+	h.IndexOffset = size + h.IndexOffset
+	return h
+}
+
+// HasIndex indicates whether the index is present.
+func (h Header) HasIndex() bool {
+	return h.IndexOffset != 0
+}
+
+// WriteTo serializes this header as bytes and writes them using the given io.Writer.
+func (h Header) WriteTo(w io.Writer) (n int64, err error) {
+	wn, err := h.Characteristics.WriteTo(w)
+	n += wn
+	if err != nil {
+		return
+	}
+	buf := make([]byte, 24)
+	binary.LittleEndian.PutUint64(buf[:8], h.DataOffset)
+	binary.LittleEndian.PutUint64(buf[8:16], h.DataSize)
+	binary.LittleEndian.PutUint64(buf[16:], h.IndexOffset)
+	written, err := w.Write(buf)
+	n += int64(written)
+	return n, err
+}
+
+// ReadFrom populates fields of this header from the given r.
+func (h *Header) ReadFrom(r io.Reader) (int64, error) {
+	n, err := h.Characteristics.ReadFrom(r)
+	if err != nil {
+		return n, err
+	}
+	buf := make([]byte, 24)
+	read, err := io.ReadFull(r, buf)
+	n += int64(read)
+	if err != nil {
+		return n, err
+	}
+	dataOffset := binary.LittleEndian.Uint64(buf[:8])
+	dataSize := binary.LittleEndian.Uint64(buf[8:16])
+	indexOffset := binary.LittleEndian.Uint64(buf[16:])
+	// Assert the data payload offset validity.
+	// It must be at least 51 (PragmaSize + HeaderSize).
+	if int64(dataOffset) < PragmaSize+HeaderSize {
+		return n, fmt.Errorf("invalid data payload offset: %v", dataOffset)
+	}
+	// Assert the data size validity.
+	// It must be larger than zero.
+	// Technically, it should be at least 11 bytes (i.e. a valid CARv1 header with no roots) but
+	// we let further parsing of the payload signal an invalid CARv1 header.
+	if int64(dataSize) <= 0 {
+		return n, fmt.Errorf("invalid data payload size: %v", dataSize)
+	}
+	// Assert the index offset validity.
+ if int64(indexOffset) < 0 { + return n, fmt.Errorf("invalid index offset: %v", indexOffset) + } + h.DataOffset = dataOffset + h.DataSize = dataSize + h.IndexOffset = indexOffset + return n, nil +} diff --git a/ipld/car/v2/car_test.go b/ipld/car/v2/car_test.go new file mode 100644 index 0000000000..d993a37468 --- /dev/null +++ b/ipld/car/v2/car_test.go @@ -0,0 +1,247 @@ +package car_test + +import ( + "bytes" + "testing" + + "github.com/stretchr/testify/require" + + carv2 "github.com/ipld/go-car/v2" + "github.com/ipld/go-car/v2/internal/carv1" + "github.com/stretchr/testify/assert" +) + +func TestCarV2PragmaLength(t *testing.T) { + tests := []struct { + name string + want interface{} + got interface{} + }{ + { + "ActualSizeShouldBe11", + 11, + len(carv2.Pragma), + }, + { + "ShouldStartWithVarint(10)", + carv2.Pragma[0], + 10, + }, + } + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + assert.EqualValues(t, tt.want, tt.got, "CarV2Pragma got = %v, want %v", tt.got, tt.want) + }) + } +} + +func TestCarV2PragmaIsValidCarV1Header(t *testing.T) { + v1h, err := carv1.ReadHeader(bytes.NewReader(carv2.Pragma), carv1.DefaultMaxAllowedHeaderSize) + assert.NoError(t, err, "cannot decode pragma as CBOR with CARv1 header structure") + assert.Equal(t, &carv1.CarHeader{ + Roots: nil, + Version: 2, + }, v1h, "CARv2 pragma must be a valid CARv1 header") +} + +func TestHeader_WriteTo(t *testing.T) { + tests := []struct { + name string + target carv2.Header + wantWrite []byte + wantErr bool + }{ + { + "HeaderWithEmptyCharacteristicsIsWrittenAsExpected", + carv2.Header{ + Characteristics: carv2.Characteristics{}, + DataOffset: 99, + }, + []byte{ + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x63, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + }, + false, + }, + { + "NonEmptyHeaderIsWrittenAsExpected", + carv2.Header{ + Characteristics: carv2.Characteristics{ + Hi: 1001, Lo: 1002, + }, + DataOffset: 99, + DataSize: 100, + IndexOffset: 101, + }, + []byte{ + 0xe9, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0xea, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x63, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x64, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x65, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + }, + false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + buf := &bytes.Buffer{} + written, err := tt.target.WriteTo(buf) + if (err != nil) != tt.wantErr { + t.Errorf("WriteTo() error = %v, wantErr %v", err, tt.wantErr) + return + } + gotWrite := buf.Bytes() + assert.Equal(t, tt.wantWrite, gotWrite, "Header.WriteTo() gotWrite = %v, wantWrite %v", gotWrite, tt.wantWrite) + assert.EqualValues(t, carv2.HeaderSize, uint64(len(gotWrite)), "WriteTo() CARv2 header length must always be %v bytes long", carv2.HeaderSize) + assert.EqualValues(t, carv2.HeaderSize, uint64(written), "WriteTo() CARv2 header byte count must always be %v bytes long", carv2.HeaderSize) + }) + } +} + +func TestHeader_ReadFrom(t *testing.T) { + tests := []struct { + name string + target []byte + wantHeader carv2.Header + wantErr bool + }{ + { + "HeaderWithEmptyCharacteristicsIsWrittenAsExpected", + []byte{ + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x63, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x64, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + }, + carv2.Header{ + Characteristics: carv2.Characteristics{}, + DataOffset: 99, + DataSize: 
100, + }, + false, + }, + { + "NonEmptyHeaderIsWrittenAsExpected", + + []byte{ + 0xe9, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0xea, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x63, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x64, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x65, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + }, + carv2.Header{ + Characteristics: carv2.Characteristics{ + Hi: 1001, Lo: 1002, + }, + DataOffset: 99, + DataSize: 100, + IndexOffset: 101, + }, + false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + gotHeader := carv2.Header{} + gotRead, err := gotHeader.ReadFrom(bytes.NewReader(tt.target)) + assert.NoError(t, err) + assert.Equal(t, int64(carv2.HeaderSize), gotRead) + assert.Equal(t, tt.wantHeader, gotHeader) + }) + } +} + +func TestHeader_WithPadding(t *testing.T) { + tests := []struct { + name string + subject carv2.Header + wantCarV1Offset uint64 + wantIndexOffset uint64 + }{ + { + "WhenNoPaddingOffsetsAreSumOfSizes", + carv2.NewHeader(123), + carv2.PragmaSize + carv2.HeaderSize, + carv2.PragmaSize + carv2.HeaderSize + 123, + }, + { + "WhenOnlyPaddingCarV1BothOffsetsShift", + carv2.NewHeader(123).WithDataPadding(3), + carv2.PragmaSize + carv2.HeaderSize + 3, + carv2.PragmaSize + carv2.HeaderSize + 3 + 123, + }, + { + "WhenPaddingBothCarV1AndIndexBothOffsetsShiftWithAdditionalIndexShift", + carv2.NewHeader(123).WithDataPadding(3).WithIndexPadding(7), + carv2.PragmaSize + carv2.HeaderSize + 3, + carv2.PragmaSize + carv2.HeaderSize + 3 + 123 + 7, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.EqualValues(t, tt.wantCarV1Offset, tt.subject.DataOffset) + assert.EqualValues(t, tt.wantIndexOffset, tt.subject.IndexOffset) + }) + } +} + +func TestNewHeaderHasExpectedValues(t *testing.T) { + wantCarV1Len := uint64(1413) + want := carv2.Header{ + Characteristics: carv2.Characteristics{}, + DataOffset: carv2.PragmaSize + carv2.HeaderSize, + DataSize: wantCarV1Len, + IndexOffset: carv2.PragmaSize + carv2.HeaderSize + wantCarV1Len, + } + got := carv2.NewHeader(wantCarV1Len) + assert.Equal(t, want, got, "NewHeader got = %v, want = %v", got, want) +} + +func TestCharacteristics_StoreIdentityCIDs(t *testing.T) { + subject := carv2.Characteristics{} + require.False(t, subject.IsFullyIndexed()) + + subject.SetFullyIndexed(true) + require.True(t, subject.IsFullyIndexed()) + + var buf bytes.Buffer + written, err := subject.WriteTo(&buf) + require.NoError(t, err) + require.Equal(t, int64(16), written) + require.Equal(t, []byte{ + 0x80, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + }, buf.Bytes()) + + var decodedSubject carv2.Characteristics + read, err := decodedSubject.ReadFrom(&buf) + require.NoError(t, err) + require.Equal(t, int64(16), read) + require.True(t, decodedSubject.IsFullyIndexed()) + + buf.Reset() + subject.SetFullyIndexed(false) + require.False(t, subject.IsFullyIndexed()) + + written, err = subject.WriteTo(&buf) + require.NoError(t, err) + require.Equal(t, int64(16), written) + require.Equal(t, []byte{ + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + }, buf.Bytes()) + + var decodedSubjectAgain carv2.Characteristics + read, err = decodedSubjectAgain.ReadFrom(&buf) + require.NoError(t, err) + require.Equal(t, int64(16), read) + require.False(t, decodedSubjectAgain.IsFullyIndexed()) +} diff --git a/ipld/car/v2/doc.go b/ipld/car/v2/doc.go new file mode 100644 index 0000000000..b12266649a --- /dev/null +++ b/ipld/car/v2/doc.go @@ -0,0 +1,8 @@ +// Package car allows 
inspecting and reading CAR files, +// described at https://ipld.io/specs/transport/car/. +// The entire library is geared towards the CARv2 spec, +// but many of the APIs consuming CAR files also accept CARv1. +// +// The blockstore sub-package contains an implementation of the +// go-ipfs-blockstore interface. +package car diff --git a/ipld/car/v2/errors.go b/ipld/car/v2/errors.go new file mode 100644 index 0000000000..ee89e0b25a --- /dev/null +++ b/ipld/car/v2/errors.go @@ -0,0 +1,18 @@ +package car + +import ( + "fmt" +) + +var _ (error) = (*ErrCidTooLarge)(nil) + +// ErrCidTooLarge signals that a CID is too large to include in CARv2 index. +// See: MaxIndexCidSize. +type ErrCidTooLarge struct { + MaxSize uint64 + CurrentSize uint64 +} + +func (e *ErrCidTooLarge) Error() string { + return fmt.Sprintf("cid size is larger than max allowed (%d > %d)", e.CurrentSize, e.MaxSize) +} diff --git a/ipld/car/v2/errors_test.go b/ipld/car/v2/errors_test.go new file mode 100644 index 0000000000..56e2c7a095 --- /dev/null +++ b/ipld/car/v2/errors_test.go @@ -0,0 +1,12 @@ +package car + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestNewErrCidTooLarge_ErrorContainsSizes(t *testing.T) { + subject := &ErrCidTooLarge{MaxSize: 1413, CurrentSize: 1414} + require.EqualError(t, subject, "cid size is larger than max allowed (1414 > 1413)") +} diff --git a/ipld/car/v2/example_test.go b/ipld/car/v2/example_test.go new file mode 100644 index 0000000000..532f87d6a5 --- /dev/null +++ b/ipld/car/v2/example_test.go @@ -0,0 +1,133 @@ +package car_test + +import ( + "bytes" + "context" + "fmt" + "io" + "os" + "path/filepath" + + carv2 "github.com/ipld/go-car/v2" + "github.com/ipld/go-car/v2/blockstore" +) + +func ExampleWrapV1File() { + // We have a sample CARv1 file. + // Wrap it as-is in a CARv2, with an index. + // Writing the result to testdata allows reusing that file in other tests, + // and also helps ensure that the result is deterministic. + src := "testdata/sample-v1.car" + tdir, err := os.MkdirTemp(os.TempDir(), "example-*") + if err != nil { + panic(err) + } + dst := filepath.Join(tdir, "wrapped-v2.car") + if err := carv2.WrapV1File(src, dst); err != nil { + panic(err) + } + + // Open our new CARv2 file and show some info about it. + cr, err := carv2.OpenReader(dst) + if err != nil { + panic(err) + } + defer func() { + if err := cr.Close(); err != nil { + panic(err) + } + }() + + roots, err := cr.Roots() + if err != nil { + panic(err) + } + fmt.Println("Roots:", roots) + fmt.Println("Has index:", cr.Header.HasIndex()) + + // Verify that the CARv1 remains exactly the same. + orig, err := os.ReadFile(src) + if err != nil { + panic(err) + } + dr, err := cr.DataReader() + if err != nil { + panic(err) + } + inner, err := io.ReadAll(dr) + if err != nil { + panic(err) + } + fmt.Println("Inner CARv1 is exactly the same:", bytes.Equal(orig, inner)) + + // Verify that the CARv2 works well with its index. + bs, err := blockstore.OpenReadOnly(dst) + if err != nil { + panic(err) + } + fmt.Println(bs.Get(context.TODO(), roots[0])) + + // Output: + // Roots: [bafy2bzaced4ueelaegfs5fqu4tzsh6ywbbpfk3cxppupmxfdhbpbhzawfw5oy] + // Has index: true + // Inner CARv1 is exactly the same: true + // [Block bafy2bzaced4ueelaegfs5fqu4tzsh6ywbbpfk3cxppupmxfdhbpbhzawfw5oy] +} + +// ExampleNewBlockReader instantiates a new BlockReader for a CARv1 file and its wrapped CARv2 +// version. For each file, it prints the version, the root CIDs and the first five block CIDs. 
+// Note, the roots and first five block CIDs are identical in both files since both represent the +// same root CIDs and data blocks. +func ExampleNewBlockReader() { + for _, path := range []string{ + "testdata/sample-v1.car", + "testdata/sample-wrapped-v2.car", + } { + fmt.Println("File:", path) + f, err := os.Open(path) + if err != nil { + panic(err) + } + br, err := carv2.NewBlockReader(f) + if err != nil { + panic(err) + } + defer func() { + if err := f.Close(); err != nil { + panic(err) + } + }() + fmt.Println("Version:", br.Version) + fmt.Println("Roots:", br.Roots) + fmt.Println("First 5 block CIDs:") + for i := 0; i < 5; i++ { + bl, err := br.Next() + if err == io.EOF { + break + } + if err != nil { + panic(err) + } + fmt.Printf("\t%v\n", bl.Cid()) + } + } + // Output: + // File: testdata/sample-v1.car + // Version: 1 + // Roots: [bafy2bzaced4ueelaegfs5fqu4tzsh6ywbbpfk3cxppupmxfdhbpbhzawfw5oy] + // First 5 block CIDs: + // bafy2bzaced4ueelaegfs5fqu4tzsh6ywbbpfk3cxppupmxfdhbpbhzawfw5oy + // bafy2bzaceaycv7jhaegckatnncu5yugzkrnzeqsppzegufr35lroxxnsnpspu + // bafy2bzaceb62wdepofqu34afqhbcn4a7jziwblt2ih5hhqqm6zitd3qpzhdp4 + // bafy2bzaceb3utcspm5jqcdqpih3ztbaztv7yunzkiyfq7up7xmokpxemwgu5u + // bafy2bzacedjwekyjresrwjqj4n2r5bnuuu3klncgjo2r3slsp6wgqb37sz4ck + // File: testdata/sample-wrapped-v2.car + // Version: 2 + // Roots: [bafy2bzaced4ueelaegfs5fqu4tzsh6ywbbpfk3cxppupmxfdhbpbhzawfw5oy] + // First 5 block CIDs: + // bafy2bzaced4ueelaegfs5fqu4tzsh6ywbbpfk3cxppupmxfdhbpbhzawfw5oy + // bafy2bzaceaycv7jhaegckatnncu5yugzkrnzeqsppzegufr35lroxxnsnpspu + // bafy2bzaceb62wdepofqu34afqhbcn4a7jziwblt2ih5hhqqm6zitd3qpzhdp4 + // bafy2bzaceb3utcspm5jqcdqpih3ztbaztv7yunzkiyfq7up7xmokpxemwgu5u + // bafy2bzacedjwekyjresrwjqj4n2r5bnuuu3klncgjo2r3slsp6wgqb37sz4ck +} diff --git a/ipld/car/v2/fuzz_test.go b/ipld/car/v2/fuzz_test.go new file mode 100644 index 0000000000..32c548547a --- /dev/null +++ b/ipld/car/v2/fuzz_test.go @@ -0,0 +1,141 @@ +//go:build go1.18 + +package car_test + +import ( + "bytes" + "encoding/hex" + "io" + "os" + "path/filepath" + "testing" + + car "github.com/ipld/go-car/v2" + "github.com/ipld/go-car/v2/internal/carv1" +) + +// v1FixtureStr is a clean carv1 single-block, single-root CAR +const v1FixtureStr = "3aa265726f6f747381d82a58250001711220151fe9e73c6267a7060c6f6c4cca943c236f4b196723489608edb42a8b8fa80b6776657273696f6e012c01711220151fe9e73c6267a7060c6f6c4cca943c236f4b196723489608edb42a8b8fa80ba165646f646779f5" + +func seedWithCarFiles(f *testing.F) { + fixture, err := hex.DecodeString(v1FixtureStr) + if err != nil { + f.Fatal(err) + } + f.Add(fixture) + files, err := filepath.Glob("testdata/*.car") + if err != nil { + f.Fatal(err) + } + for _, fname := range files { + func() { + file, err := os.Open(fname) + if err != nil { + f.Fatal(err) + } + defer file.Close() + data, err := io.ReadAll(file) + if err != nil { + f.Fatal(err) + } + f.Add(data) + }() + } +} + +func FuzzBlockReader(f *testing.F) { + seedWithCarFiles(f) + + f.Fuzz(func(t *testing.T, data []byte) { + r, err := car.NewBlockReader(bytes.NewReader(data)) + if err != nil { + return + } + + for { + _, err = r.Next() + if err == io.EOF { + return + } + } + }) +} + +func FuzzReader(f *testing.F) { + seedWithCarFiles(f) + + f.Fuzz(func(t *testing.T, data []byte) { + subject, err := car.NewReader(bytes.NewReader(data)) + if err != nil { + return + } + + subject.Roots() + _, err = subject.IndexReader() + if err != nil { + return + } + dr, err := subject.DataReader() + if err != nil { + return + } + car.GenerateIndex(dr) + 
})
+}
+
+func FuzzInspect(f *testing.F) {
+	seedWithCarFiles(f)
+
+	f.Fuzz(func(t *testing.T, data []byte) {
+		reader, err := car.NewReader(bytes.NewReader(data))
+		if err != nil {
+			return
+		}
+
+		// Do differential fuzzing between Inspect and the normal parser
+		_, inspectErr := reader.Inspect(true)
+		if inspectErr == nil {
+			return
+		}
+
+		reader, err = car.NewReader(bytes.NewReader(data))
+		if err != nil {
+			t.Fatal("second NewReader on same data failed", err.Error())
+		}
+		i, err := reader.IndexReader()
+		if err != nil {
+			return
+		}
+		// FIXME: Once indexes are safe to parse, do not skip .car with index in the differential fuzzing.
+		if i == nil {
+			return
+		}
+		dr, err := reader.DataReader()
+		if err != nil {
+			return
+		}
+
+		_, err = carv1.ReadHeader(dr, carv1.DefaultMaxAllowedHeaderSize)
+		if err != nil {
+			return
+		}
+
+		blocks, err := car.NewBlockReader(dr)
+		if err != nil {
+			return
+		}
+
+		for {
+			_, err := blocks.Next()
+			if err != nil {
+				if err == io.EOF {
+					break
+				}
+				// caught error as expected
+				return
+			}
+		}
+
+		t.Fatal("Inspect found an error but we read this file correctly:", inspectErr.Error())
+	})
+}
diff --git a/ipld/car/v2/index/doc.go b/ipld/car/v2/index/doc.go
new file mode 100644
index 0000000000..b8062cc186
--- /dev/null
+++ b/ipld/car/v2/index/doc.go
@@ -0,0 +1,6 @@
+// Package index provides indexing functionality for the CARv1 data payload, represented as a mapping of
+// CID to offset. This can then be used to implement random access over a CARv1.
+//
+// An index can be written or read using the following static functions: index.WriteTo and
+// index.ReadFrom.
+package index
diff --git a/ipld/car/v2/index/errors.go b/ipld/car/v2/index/errors.go
new file mode 100644
index 0000000000..1a10a98467
--- /dev/null
+++ b/ipld/car/v2/index/errors.go
@@ -0,0 +1,6 @@
+package index
+
+import "errors"
+
+// ErrNotFound signals a record is not found in the index.
+var ErrNotFound = errors.New("not found")
diff --git a/ipld/car/v2/index/example_test.go b/ipld/car/v2/index/example_test.go
new file mode 100644
index 0000000000..2b9cca7c66
--- /dev/null
+++ b/ipld/car/v2/index/example_test.go
@@ -0,0 +1,113 @@
+package index_test
+
+import (
+	"fmt"
+	"io"
+	"os"
+	"reflect"
+
+	carv2 "github.com/ipld/go-car/v2"
+	"github.com/ipld/go-car/v2/index"
+)
+
+// ExampleReadFrom unmarshals an index from an indexed CARv2 file, and for each root CID prints the
+// offset at which its corresponding block starts relative to the wrapped CARv1 data payload.
+func ExampleReadFrom() {
+	// Open the CARv2 file
+	cr, err := carv2.OpenReader("../testdata/sample-wrapped-v2.car")
+	if err != nil {
+		panic(err)
+	}
+	defer cr.Close()
+
+	// Get root CIDs in the CARv1 file.
+	roots, err := cr.Roots()
+	if err != nil {
+		panic(err)
+	}
+
+	// Read and unmarshal the index within the CARv2 file.
+	ir, err := cr.IndexReader()
+	if err != nil {
+		panic(err)
+	}
+	idx, err := index.ReadFrom(ir)
+	if err != nil {
+		panic(err)
+	}
+
+	// For each root CID print the offset relative to the CARv1 data payload.
+	for _, r := range roots {
+		offset, err := index.GetFirst(idx, r)
+		if err != nil {
+			panic(err)
+		}
+		fmt.Printf("Frame with CID %v starts at offset %v relative to CARv1 data payload.\n", r, offset)
+	}
+
+	// Output:
+	// Frame with CID bafy2bzaced4ueelaegfs5fqu4tzsh6ywbbpfk3cxppupmxfdhbpbhzawfw5oy starts at offset 61 relative to CARv1 data payload.
+}
+
+// ExampleWriteTo unmarshals an index from an indexed CARv2 file, and stores it as a separate
+// file on disk.
+func ExampleWriteTo() {
+	// Open the CARv2 file
+	src := "../testdata/sample-wrapped-v2.car"
+	cr, err := carv2.OpenReader(src)
+	if err != nil {
+		panic(err)
+	}
+	defer func() {
+		if err := cr.Close(); err != nil {
+			panic(err)
+		}
+	}()
+
+	// Read and unmarshal the index within the CARv2 file.
+	ir, err := cr.IndexReader()
+	if err != nil {
+		panic(err)
+	}
+	idx, err := index.ReadFrom(ir)
+	if err != nil {
+		panic(err)
+	}
+
+	// Store the index alone into a destination file.
+	f, err := os.CreateTemp(os.TempDir(), "example-index-*.carindex")
+	if err != nil {
+		panic(err)
+	}
+	defer func() {
+		if err := f.Close(); err != nil {
+			panic(err)
+		}
+	}()
+	_, err = index.WriteTo(idx, f)
+	if err != nil {
+		panic(err)
+	}
+
+	// Seek to the beginning of the file to read it back.
+	_, err = f.Seek(0, io.SeekStart)
+	if err != nil {
+		panic(err)
+	}
+
+	// Read and unmarshal the destination file as a separate index instance.
+	reReadIdx, err := index.ReadFrom(f)
+	if err != nil {
+		panic(err)
+	}
+
+	// Expect indices to be equal.
+	if reflect.DeepEqual(idx, reReadIdx) {
+		fmt.Printf("Saved index file matches the index embedded in CARv2 at %v.\n", src)
+	} else {
+		panic("expected to get the same index as the CARv2 file")
+	}
+
+	// Output:
+	// Saved index file matches the index embedded in CARv2 at ../testdata/sample-wrapped-v2.car.
+}
diff --git a/ipld/car/v2/index/index.go b/ipld/car/v2/index/index.go
new file mode 100644
index 0000000000..af16fcc1e1
--- /dev/null
+++ b/ipld/car/v2/index/index.go
@@ -0,0 +1,160 @@
+package index
+
+import (
+	"encoding/binary"
+	"fmt"
+	"io"
+
+	"github.com/ipfs/go-cid"
+	internalio "github.com/ipld/go-car/v2/internal/io"
+
+	"github.com/multiformats/go-multicodec"
+	"github.com/multiformats/go-multihash"
+	"github.com/multiformats/go-varint"
+)
+
+// CarIndexNone is a sentinel value used as a multicodec code for the index indicating no index.
+const CarIndexNone = 0x300000
+
+type (
+	// Record is a pre-processed record of a car item and location.
+	Record struct {
+		cid.Cid
+		Offset uint64
+	}
+
+	// Index provides an interface for looking up the byte offset of a given CID.
+	//
+	// Note that each indexing mechanism is free to match CIDs however it
+	// sees fit. For example, multicodec.CarIndexSorted only indexes
+	// multihash digests, meaning that Get and GetAll will find matching
+	// blocks even if the CID's encoding multicodec differs. Other index
+	// implementations might index the entire CID, the entire multihash, or
+	// just part of a multihash's digest.
+	//
+	// See: multicodec.CarIndexSorted, multicodec.CarMultihashIndexSorted
+	Index interface {
+		// Codec provides the multicodec code that the index implements.
+		//
+		// Note that this may return a reserved code if the index
+		// implementation is not defined in a spec.
+		Codec() multicodec.Code
+
+		// Marshal encodes the index in serial form.
+		Marshal(w io.Writer) (uint64, error)
+
+		// Unmarshal decodes the index from its serial form.
+		// Note, this function will copy the entire index into memory.
+		//
+		// Do not unmarshal index from untrusted CARv2 files. Instead, the index should be
+		// regenerated from the CARv2 data payload.
+		Unmarshal(r io.Reader) error
+
+		// Load inserts a number of records into the index.
+		// Note that Index will load all given records. Any filtering of the records such as
+		// exclusion of CIDs with multihash.IDENTITY code must occur prior to calling this function.
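+		//
+		// For example (a sketch; c and offset are hypothetical values from a
+		// caller's own traversal), identity CIDs may be skipped before loading:
+		//
+		//	if c.Prefix().MhType != multihash.IDENTITY {
+		//		records = append(records, Record{Cid: c, Offset: offset})
+		//	}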
+		//
+		// Further, the actual information extracted and indexed from the given records entirely
+		// depends on the concrete index implementation.
+		// For example, some index implementations may only store partial multihashes.
+		Load([]Record) error
+
+		// GetAll looks up all blocks matching a given CID,
+		// calling a function for each one of their offsets.
+		//
+		// GetAll stops if the given function returns false,
+		// or there are no more offsets; whichever happens first.
+		//
+		// If no error occurred and the CID isn't indexed,
+		// meaning that no callbacks happen,
+		// ErrNotFound is returned.
+		GetAll(cid.Cid, func(uint64) bool) error
+	}
+
+	// IterableIndex is an index which supports iterating over its elements.
+	IterableIndex interface {
+		Index
+
+		// ForEach takes a callback function that will be called
+		// on each entry in the index. The arguments to the callback are
+		// the multihash of the element, and the offset in the car file
+		// where the element appears.
+		//
+		// If the callback returns a non-nil error, the iteration is aborted,
+		// and the ForEach function returns the error to the user.
+		//
+		// An index may contain multiple offsets corresponding to the same multihash, e.g. via duplicate blocks.
+		// In such cases, the given function may be called multiple times with the same multihash but different offset.
+		//
+		// The order of calls to the given function is deterministic, but entirely index-specific.
+		ForEach(func(multihash.Multihash, uint64) error) error
+	}
)
+
+// GetFirst is a wrapper over Index.GetAll, returning the offset for the first
+// matching indexed CID.
+func GetFirst(idx Index, key cid.Cid) (uint64, error) {
+	var firstOffset uint64
+	err := idx.GetAll(key, func(offset uint64) bool {
+		firstOffset = offset
+		return false
+	})
+	return firstOffset, err
+}
+
+// New constructs a new index corresponding to the given CAR index codec.
+func New(codec multicodec.Code) (Index, error) {
+	switch codec {
+	case multicodec.CarIndexSorted:
+		return newSorted(), nil
+	case multicodec.CarMultihashIndexSorted:
+		return NewMultihashSorted(), nil
+	default:
+		return nil, fmt.Errorf("unknown index codec: %v", codec)
+	}
+}
+
+// WriteTo writes the given idx into w.
+// The written bytes include the index encoding.
+// This can then be read back using index.ReadFrom.
+func WriteTo(idx Index, w io.Writer) (uint64, error) {
+	buf := make([]byte, binary.MaxVarintLen64)
+	b := varint.PutUvarint(buf, uint64(idx.Codec()))
+	n, err := w.Write(buf[:b])
+	if err != nil {
+		return uint64(n), err
+	}
+
+	l, err := idx.Marshal(w)
+	return uint64(n) + l, err
+}
+
+// ReadFrom reads an index from r.
+// The reader decodes the index by reading the first varint to interpret the encoding.
+// Returns an error if the encoding is not known.
+//
+// Attempting to read index data from untrusted sources is not recommended.
+// Instead, the index should be regenerated from the CARv2 data payload.
+func ReadFrom(r io.Reader) (Index, error) {
+	codec, err := ReadCodec(r)
+	if err != nil {
+		return nil, err
+	}
+	idx, err := New(codec)
+	if err != nil {
+		return nil, err
+	}
+	if err := idx.Unmarshal(r); err != nil {
+		return nil, err
+	}
+	return idx, nil
+}
+
+// ReadCodec reads the codec of the index by decoding the first varint read from r.
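+//
+// For illustration (a sketch), a caller can peek at the codec before deciding
+// how to handle the rest of the stream:
+//
+//	codec, err := ReadCodec(r)
+//	if err == nil && codec == multicodec.CarMultihashIndexSorted {
+//		// proceed to construct via New(codec) and Unmarshal the remainder
+//	}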
+func ReadCodec(r io.Reader) (multicodec.Code, error) {
+	code, err := varint.ReadUvarint(internalio.ToByteReader(r))
+	if err != nil {
+		return 0, err
+	}
+	return multicodec.Code(code), nil
+}
diff --git a/ipld/car/v2/index/index_test.go b/ipld/car/v2/index/index_test.go
new file mode 100644
index 0000000000..b7f83d3c48
--- /dev/null
+++ b/ipld/car/v2/index/index_test.go
@@ -0,0 +1,168 @@
+package index
+
+import (
+	"bytes"
+	"io"
+	"os"
+	"path/filepath"
+	"testing"
+
+	blocks "github.com/ipfs/go-libipfs/blocks"
+	"github.com/ipld/go-car/v2/internal/carv1"
+	"github.com/ipld/go-car/v2/internal/carv1/util"
+	"github.com/multiformats/go-multicodec"
+	"github.com/multiformats/go-varint"
+	"github.com/stretchr/testify/require"
+)
+
+func TestNew(t *testing.T) {
+	tests := []struct {
+		name    string
+		codec   multicodec.Code
+		want    Index
+		wantErr bool
+	}{
+		{
+			name:  "CarSortedIndexCodecIsConstructed",
+			codec: multicodec.CarIndexSorted,
+			want:  newSorted(),
+		},
+		{
+			name:    "ValidMultiCodecButUnknownToIndexIsError",
+			codec:   multicodec.Cidv1,
+			wantErr: true,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got, err := New(tt.codec)
+			if tt.wantErr {
+				require.Error(t, err)
+			} else {
+				require.NoError(t, err)
+				require.Equal(t, tt.want, got)
+			}
+		})
+	}
+}
+
+func TestReadFrom(t *testing.T) {
+	idxf, err := os.Open("../testdata/sample-index.carindex")
+	require.NoError(t, err)
+	t.Cleanup(func() { require.NoError(t, idxf.Close()) })
+
+	subject, err := ReadFrom(idxf)
+	require.NoError(t, err)
+
+	_, err = idxf.Seek(0, io.SeekStart)
+	require.NoError(t, err)
+
+	idxf2, err := os.Open("../testdata/sample-multihash-index-sorted.carindex")
+	require.NoError(t, err)
+	t.Cleanup(func() { require.NoError(t, idxf2.Close()) })
+
+	subjectInAltFormat, err := ReadFrom(idxf2)
+	require.NoError(t, err)
+
+	crf, err := os.Open("../testdata/sample-v1.car")
+	require.NoError(t, err)
+	t.Cleanup(func() { require.NoError(t, crf.Close()) })
+	cr, err := carv1.NewCarReader(crf)
+	require.NoError(t, err)
+
+	for {
+		wantBlock, err := cr.Next()
+		if err == io.EOF {
+			break
+		}
+		require.NoError(t, err)
+
+		wantCid := wantBlock.Cid()
+		// Get offset from the index for a CID and assert it exists
+		gotOffset, err := GetFirst(subject, wantCid)
+		require.NoError(t, err)
+		require.NotZero(t, gotOffset)
+
+		// Get offset from the index in alternative format for a CID and assert it exists
+		gotOffset2, err := GetFirst(subjectInAltFormat, wantCid)
+		require.NoError(t, err)
+		require.NotZero(t, gotOffset2)
+
+		// Seek to the offset on CARv1 file
+		_, err = crf.Seek(int64(gotOffset), io.SeekStart)
+		require.NoError(t, err)
+
+		// Read the frame at the offset and assert the frame corresponds to the expected block.
+ gotCid, gotData, err := util.ReadNode(crf, false, carv1.DefaultMaxAllowedSectionSize) + require.NoError(t, err) + gotBlock, err := blocks.NewBlockWithCid(gotData, gotCid) + require.NoError(t, err) + require.Equal(t, wantBlock, gotBlock) + } +} + +func TestWriteTo(t *testing.T) { + // Read sample index on file + idxf, err := os.Open("../testdata/sample-multihash-index-sorted.carindex") + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, idxf.Close()) }) + + // Unmarshall to get expected index + wantIdx, err := ReadFrom(idxf) + require.NoError(t, err) + + // Write the same index out + dest := filepath.Join(t.TempDir(), "index-write-to-test.carindex") + destF, err := os.Create(dest) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, destF.Close()) }) + _, err = WriteTo(wantIdx, destF) + require.NoError(t, err) + + // Seek to the beginning of the written out file. + _, err = destF.Seek(0, io.SeekStart) + require.NoError(t, err) + + // Read the written index back + gotIdx, err := ReadFrom(destF) + require.NoError(t, err) + + // Assert they are equal + require.Equal(t, wantIdx, gotIdx) +} + +func TestMarshalledIndexStartsWithCodec(t *testing.T) { + + tests := []struct { + path string + codec multicodec.Code + }{ + { + path: "../testdata/sample-multihash-index-sorted.carindex", + codec: multicodec.CarMultihashIndexSorted, + }, + { + path: "../testdata/sample-index.carindex", + codec: multicodec.CarIndexSorted, + }, + } + for _, test := range tests { + test := test + t.Run(test.codec.String(), func(t *testing.T) { + // Read sample index on file + idxf, err := os.Open(test.path) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, idxf.Close()) }) + + // Unmarshall to get expected index + wantIdx, err := ReadFrom(idxf) + require.NoError(t, err) + + // Assert the first two bytes are the corresponding multicodec code. + buf := new(bytes.Buffer) + _, err = WriteTo(wantIdx, buf) + require.NoError(t, err) + require.Equal(t, varint.ToUvarint(uint64(test.codec)), buf.Bytes()[:2]) + }) + } +} diff --git a/ipld/car/v2/index/indexsorted.go b/ipld/car/v2/index/indexsorted.go new file mode 100644 index 0000000000..ab1462dc43 --- /dev/null +++ b/ipld/car/v2/index/indexsorted.go @@ -0,0 +1,319 @@ +package index + +import ( + "bytes" + "encoding/binary" + "errors" + "fmt" + "io" + "sort" + + internalio "github.com/ipld/go-car/v2/internal/io" + + "github.com/multiformats/go-multicodec" + + "github.com/ipfs/go-cid" + "github.com/multiformats/go-multihash" +) + +var _ Index = (*multiWidthIndex)(nil) + +type ( + digestRecord struct { + digest []byte + index uint64 + } + recordSet []digestRecord + singleWidthIndex struct { + width uint32 + len uint64 // in struct, len is #items. when marshaled, it's saved as #bytes. 
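+		// index is the concatenation of fixed-width records, each being a multihash digest
+		// followed by its section offset as a little-endian uint64, sorted by digest.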
+ index []byte + } + multiWidthIndex map[uint32]singleWidthIndex +) + +func (d digestRecord) write(buf []byte) { + n := copy(buf[:], d.digest) + binary.LittleEndian.PutUint64(buf[n:], d.index) +} + +func (r recordSet) Len() int { + return len(r) +} + +func (r recordSet) Less(i, j int) bool { + return bytes.Compare(r[i].digest, r[j].digest) < 0 +} + +func (r recordSet) Swap(i, j int) { + r[i], r[j] = r[j], r[i] +} + +func (s *singleWidthIndex) Marshal(w io.Writer) (uint64, error) { + l := uint64(0) + if err := binary.Write(w, binary.LittleEndian, s.width); err != nil { + return 0, err + } + l += 4 + if err := binary.Write(w, binary.LittleEndian, int64(len(s.index))); err != nil { + return l, err + } + l += 8 + n, err := w.Write(s.index) + return l + uint64(n), err +} + +func (s *singleWidthIndex) Unmarshal(r io.Reader) error { + var width uint32 + if err := binary.Read(r, binary.LittleEndian, &width); err != nil { + if err == io.EOF { + return io.ErrUnexpectedEOF + } + return err + } + var dataLen uint64 + if err := binary.Read(r, binary.LittleEndian, &dataLen); err != nil { + if err == io.EOF { + return io.ErrUnexpectedEOF + } + return err + } + + if err := s.checkUnmarshalLengths(width, dataLen, 0); err != nil { + return err + } + + buf := make([]byte, dataLen) + if _, err := io.ReadFull(r, buf); err != nil { + return err + } + s.index = buf + return nil +} + +func (s *singleWidthIndex) checkUnmarshalLengths(width uint32, dataLen, extra uint64) error { + if width < 8 { + return errors.New("malformed index; width must be at least 8") + } + const maxWidth = 32 << 20 // 32MiB, to ~match the go-cid maximum + if width > maxWidth { + return errors.New("index too big; singleWidthIndex width is larger than allowed maximum") + } + oldDataLen, dataLen := dataLen, dataLen+extra + if oldDataLen > dataLen { + return errors.New("index too big; singleWidthIndex len is overflowing") + } + if int64(dataLen) < 0 { + return errors.New("index too big; singleWidthIndex len is overflowing int64") + } + s.width = width + s.len = dataLen / uint64(width) + return nil +} + +func (s *singleWidthIndex) Less(i int, digest []byte) bool { + return bytes.Compare(digest[:], s.index[i*int(s.width):((i+1)*int(s.width)-8)]) <= 0 +} + +func (s *singleWidthIndex) GetAll(c cid.Cid, fn func(uint64) bool) error { + d, err := multihash.Decode(c.Hash()) + if err != nil { + return err + } + return s.getAll(d.Digest, fn) +} + +func (s *singleWidthIndex) getAll(d []byte, fn func(uint64) bool) error { + idx := sort.Search(int(s.len), func(i int) bool { + return s.Less(i, d) + }) + + var any bool + for ; uint64(idx) < s.len; idx++ { + digestStart := idx * int(s.width) + offsetEnd := (idx + 1) * int(s.width) + digestEnd := offsetEnd - 8 + if bytes.Equal(d[:], s.index[digestStart:digestEnd]) { + any = true + offset := binary.LittleEndian.Uint64(s.index[digestEnd:offsetEnd]) + if !fn(offset) { + // User signalled to stop searching; therefore, break. + break + } + } else { + // No more matches; therefore, break. 
+ break + } + } + if !any { + return ErrNotFound + } + return nil +} + +func (s *singleWidthIndex) Load(items []Record) error { + m := make(multiWidthIndex) + if err := m.Load(items); err != nil { + return err + } + if len(m) != 1 { + return fmt.Errorf("unexpected number of cid widths: %d", len(m)) + } + for _, i := range m { + s.index = i.index + s.len = i.len + s.width = i.width + return nil + } + return nil +} + +func (s *singleWidthIndex) forEachDigest(f func(digest []byte, offset uint64) error) error { + segmentCount := len(s.index) / int(s.width) + for i := 0; i < segmentCount; i++ { + digestStart := i * int(s.width) + offsetEnd := (i + 1) * int(s.width) + digestEnd := offsetEnd - 8 + digest := s.index[digestStart:digestEnd] + offset := binary.LittleEndian.Uint64(s.index[digestEnd:offsetEnd]) + if err := f(digest, offset); err != nil { + return err + } + } + return nil +} + +func (m *multiWidthIndex) GetAll(c cid.Cid, fn func(uint64) bool) error { + d, err := multihash.Decode(c.Hash()) + if err != nil { + return err + } + if s, ok := (*m)[uint32(len(d.Digest)+8)]; ok { + return s.getAll(d.Digest, fn) + } + return ErrNotFound +} + +func (m *multiWidthIndex) Codec() multicodec.Code { + return multicodec.CarIndexSorted +} + +func (m *multiWidthIndex) Marshal(w io.Writer) (uint64, error) { + l := uint64(0) + if err := binary.Write(w, binary.LittleEndian, int32(len(*m))); err != nil { + return l, err + } + l += 4 + + // The widths are unique, but ranging over a map isn't deterministic. + // As per the CARv2 spec, we must order buckets by digest length. + + widths := make([]uint32, 0, len(*m)) + for width := range *m { + widths = append(widths, width) + } + sort.Slice(widths, func(i, j int) bool { + return widths[i] < widths[j] + }) + + for _, width := range widths { + bucket := (*m)[width] + n, err := bucket.Marshal(w) + l += n + if err != nil { + return l, err + } + } + return l, nil +} + +func (m *multiWidthIndex) Unmarshal(r io.Reader) error { + reader := internalio.ToByteReadSeeker(r) + var l int32 + if err := binary.Read(reader, binary.LittleEndian, &l); err != nil { + if err == io.EOF { + return io.ErrUnexpectedEOF + } + return err + } + sum, err := reader.Seek(0, io.SeekCurrent) + if err != nil { + return err + } + if int32(l) < 0 { + return errors.New("index too big; multiWidthIndex count is overflowing int32") + } + for i := 0; i < int(l); i++ { + s := singleWidthIndex{} + if err := s.Unmarshal(r); err != nil { + return err + } + n, err := reader.Seek(0, io.SeekCurrent) + if err != nil { + return err + } + oldSum := sum + sum += n + if sum < oldSum { + return errors.New("index too big; multiWidthIndex len is overflowing int64") + } + (*m)[s.width] = s + } + return nil +} + +func (m *multiWidthIndex) Load(items []Record) error { + // Split cids on their digest length + idxs := make(map[int][]digestRecord) + for _, item := range items { + decHash, err := multihash.Decode(item.Hash()) + if err != nil { + return err + } + + digest := decHash.Digest + idx, ok := idxs[len(digest)] + if !ok { + idxs[len(digest)] = make([]digestRecord, 0) + idx = idxs[len(digest)] + } + idxs[len(digest)] = append(idx, digestRecord{digest, item.Offset}) + } + + // Sort each list. then write to compact form. 
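+	// Each compact record occupies width+8 bytes: the digest itself followed by the
+	// section offset encoded as a little-endian uint64.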
+ for width, lst := range idxs { + sort.Sort(recordSet(lst)) + rcrdWdth := width + 8 + compact := make([]byte, rcrdWdth*len(lst)) + for off, itm := range lst { + itm.write(compact[off*rcrdWdth : (off+1)*rcrdWdth]) + } + s := singleWidthIndex{ + width: uint32(rcrdWdth), + len: uint64(len(lst)), + index: compact, + } + (*m)[uint32(width)+8] = s + } + return nil +} + +func (m *multiWidthIndex) forEachDigest(f func(digest []byte, offset uint64) error) error { + sizes := make([]uint32, 0, len(*m)) + for k := range *m { + sizes = append(sizes, k) + } + sort.Slice(sizes, func(i, j int) bool { return sizes[i] < sizes[j] }) + for _, s := range sizes { + swi := (*m)[s] + if err := swi.forEachDigest(f); err != nil { + return err + } + } + return nil +} + +func newSorted() Index { + m := make(multiWidthIndex) + return &m +} diff --git a/ipld/car/v2/index/indexsorted_test.go b/ipld/car/v2/index/indexsorted_test.go new file mode 100644 index 0000000000..5c1ee44956 --- /dev/null +++ b/ipld/car/v2/index/indexsorted_test.go @@ -0,0 +1,64 @@ +package index + +import ( + "encoding/binary" + "testing" + + "github.com/ipfs/go-merkledag" + "github.com/multiformats/go-multicodec" + "github.com/stretchr/testify/require" +) + +func TestSortedIndexCodec(t *testing.T) { + require.Equal(t, multicodec.CarIndexSorted, newSorted().Codec()) +} + +func TestIndexSorted_GetReturnsNotFoundWhenCidDoesNotExist(t *testing.T) { + nonExistingKey := merkledag.NewRawNode([]byte("lobstermuncher")).Block.Cid() + tests := []struct { + name string + subject Index + }{ + { + "Sorted", + newSorted(), + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + gotOffset, err := GetFirst(tt.subject, nonExistingKey) + require.Equal(t, ErrNotFound, err) + require.Equal(t, uint64(0), gotOffset) + }) + } +} + +func TestSingleWidthIndex_GetAll(t *testing.T) { + l := 4 + width := 9 + buf := make([]byte, width*l) + + // Populate the index bytes as total of four records. + // The last record should not match the getAll. + for i := 0; i < l; i++ { + if i < l-1 { + buf[i*width] = 1 + } else { + buf[i*width] = 2 + } + binary.LittleEndian.PutUint64(buf[(i*width)+1:(i*width)+width], uint64(14)) + } + subject := &singleWidthIndex{ + width: 9, + len: uint64(l), + index: buf, + } + + var foundCount int + err := subject.getAll([]byte{1}, func(u uint64) bool { + foundCount++ + return true + }) + require.NoError(t, err) + require.Equal(t, 3, foundCount) +} diff --git a/ipld/car/v2/index/mhindexsorted.go b/ipld/car/v2/index/mhindexsorted.go new file mode 100644 index 0000000000..598f1701fd --- /dev/null +++ b/ipld/car/v2/index/mhindexsorted.go @@ -0,0 +1,214 @@ +package index + +import ( + "encoding/binary" + "errors" + "io" + "sort" + + internalio "github.com/ipld/go-car/v2/internal/io" + + "github.com/ipfs/go-cid" + "github.com/multiformats/go-multicodec" + "github.com/multiformats/go-multihash" +) + +var ( + _ Index = (*MultihashIndexSorted)(nil) + _ IterableIndex = (*MultihashIndexSorted)(nil) +) + +type ( + // MultihashIndexSorted maps multihash code (i.e. hashing algorithm) to multiWidthCodedIndex. + MultihashIndexSorted map[uint64]*multiWidthCodedIndex + // multiWidthCodedIndex stores multihash code for each multiWidthIndex. 
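+	// The embedded multiWidthIndex still buckets entries by digest width; the code records
+	// which hash function produced the digests, so full multihashes can be reconstructed.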
+ multiWidthCodedIndex struct { + multiWidthIndex + code uint64 + } +) + +func newMultiWidthCodedIndex() *multiWidthCodedIndex { + return &multiWidthCodedIndex{ + multiWidthIndex: make(multiWidthIndex), + } +} + +func (m *multiWidthCodedIndex) Marshal(w io.Writer) (uint64, error) { + if err := binary.Write(w, binary.LittleEndian, m.code); err != nil { + return 8, err + } + n, err := m.multiWidthIndex.Marshal(w) + return 8 + n, err +} + +func (m *multiWidthCodedIndex) Unmarshal(r io.Reader) error { + if err := binary.Read(r, binary.LittleEndian, &m.code); err != nil { + if err == io.EOF { + return io.ErrUnexpectedEOF + } + return err + } + return m.multiWidthIndex.Unmarshal(r) +} + +func (m *multiWidthCodedIndex) forEach(f func(mh multihash.Multihash, offset uint64) error) error { + return m.multiWidthIndex.forEachDigest(func(digest []byte, offset uint64) error { + mh, err := multihash.Encode(digest, m.code) + if err != nil { + return err + } + return f(mh, offset) + }) +} + +func (m *MultihashIndexSorted) Codec() multicodec.Code { + return multicodec.CarMultihashIndexSorted +} + +func (m *MultihashIndexSorted) Marshal(w io.Writer) (uint64, error) { + if err := binary.Write(w, binary.LittleEndian, int32(len(*m))); err != nil { + return 4, err + } + // The codes are unique, but ranging over a map isn't deterministic. + // As per the CARv2 spec, we must order buckets by digest length. + // TODO update CARv2 spec to reflect this for the new index type. + codes := m.sortedMultihashCodes() + l := uint64(4) + + for _, code := range codes { + mwci := (*m)[code] + n, err := mwci.Marshal(w) + l += n + if err != nil { + return l, err + } + } + return l, nil +} + +func (m *MultihashIndexSorted) sortedMultihashCodes() []uint64 { + codes := make([]uint64, 0, len(*m)) + for code := range *m { + codes = append(codes, code) + } + sort.Slice(codes, func(i, j int) bool { + return codes[i] < codes[j] + }) + return codes +} + +func (m *MultihashIndexSorted) Unmarshal(r io.Reader) error { + reader := internalio.ToByteReadSeeker(r) + var l int32 + if err := binary.Read(reader, binary.LittleEndian, &l); err != nil { + if err == io.EOF { + return io.ErrUnexpectedEOF + } + return err + } + sum, err := reader.Seek(0, io.SeekCurrent) + if err != nil { + return err + } + if int32(l) < 0 { + return errors.New("index too big; MultihashIndexSorted count is overflowing int32") + } + for i := 0; i < int(l); i++ { + mwci := newMultiWidthCodedIndex() + if err := mwci.Unmarshal(r); err != nil { + return err + } + n, err := reader.Seek(0, io.SeekCurrent) + if err != nil { + return err + } + oldSum := sum + sum += n + if sum < oldSum { + return errors.New("index too big; MultihashIndexSorted len is overflowing int64") + } + m.put(mwci) + } + return nil +} + +func (m *MultihashIndexSorted) put(mwci *multiWidthCodedIndex) { + (*m)[mwci.code] = mwci +} + +func (m *MultihashIndexSorted) Load(records []Record) error { + // TODO optimize load by avoiding multihash decoding twice. + // This implementation decodes multihashes twice: once here to group by code, and once in the + // internals of multiWidthIndex to group by digest length. The code can be optimized by + // combining the grouping logic into one step where the multihash of a CID is only decoded once. + // The optimization would need refactoring of the IndexSorted compaction logic. 
+
+	// Group records by multihash code
+	byCode := make(map[uint64][]Record)
+	for _, record := range records {
+		dmh, err := multihash.Decode(record.Hash())
+		if err != nil {
+			return err
+		}
+		code := dmh.Code
+		recsByCode, ok := byCode[code]
+		if !ok {
+			recsByCode = make([]Record, 0)
+			byCode[code] = recsByCode
+		}
+		byCode[code] = append(recsByCode, record)
+	}
+
+	// Load each record group.
+	for code, recsByCode := range byCode {
+		mwci := newMultiWidthCodedIndex()
+		mwci.code = code
+		if err := mwci.Load(recsByCode); err != nil {
+			return err
+		}
+		m.put(mwci)
+	}
+	return nil
+}
+
+func (m *MultihashIndexSorted) GetAll(cid cid.Cid, f func(uint64) bool) error {
+	hash := cid.Hash()
+	dmh, err := multihash.Decode(hash)
+	if err != nil {
+		return err
+	}
+	mwci, err := m.get(dmh)
+	if err != nil {
+		return err
+	}
+	return mwci.GetAll(cid, f)
+}
+
+// ForEach calls f for every multihash and its associated offset stored by this index.
+func (m *MultihashIndexSorted) ForEach(f func(mh multihash.Multihash, offset uint64) error) error {
+	sizes := make([]uint64, 0, len(*m))
+	for k := range *m {
+		sizes = append(sizes, k)
+	}
+	sort.Slice(sizes, func(i, j int) bool { return sizes[i] < sizes[j] })
+	for _, s := range sizes {
+		mwci := (*m)[s]
+		if err := mwci.forEach(f); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func (m *MultihashIndexSorted) get(dmh *multihash.DecodedMultihash) (*multiWidthCodedIndex, error) {
+	if codedIdx, ok := (*m)[dmh.Code]; ok {
+		return codedIdx, nil
+	}
+	return nil, ErrNotFound
+}
+
+func NewMultihashSorted() *MultihashIndexSorted {
+	index := make(MultihashIndexSorted)
+	return &index
+}
diff --git a/ipld/car/v2/index/mhindexsorted_test.go b/ipld/car/v2/index/mhindexsorted_test.go
new file mode 100644
index 0000000000..520157e447
--- /dev/null
+++ b/ipld/car/v2/index/mhindexsorted_test.go
@@ -0,0 +1,113 @@
+package index_test
+
+import (
+	"bytes"
+	"fmt"
+	"math/rand"
+	"testing"
+
+	"github.com/multiformats/go-multicodec"
+
+	"github.com/ipfs/go-cid"
+	"github.com/ipld/go-car/v2/index"
+	"github.com/multiformats/go-multihash"
+	"github.com/stretchr/testify/require"
+)
+
+func TestMultihashSortedIndex_Codec(t *testing.T) {
+	subject, err := index.New(multicodec.CarMultihashIndexSorted)
+	require.NoError(t, err)
+	require.Equal(t, multicodec.CarMultihashIndexSorted, subject.Codec())
+}
+
+func TestMultiWidthCodedIndex_MarshalUnmarshal(t *testing.T) {
+	rng := rand.New(rand.NewSource(1413))
+	records := generateIndexRecords(t, multihash.SHA2_256, rng)
+
+	// Create a new mh sorted index and load randomly generated records into it.
+	subject, err := index.New(multicodec.CarMultihashIndexSorted)
+	require.NoError(t, err)
+	err = subject.Load(records)
+	require.NoError(t, err)
+
+	// Marshal the index.
+	buf := new(bytes.Buffer)
+	_, err = subject.Marshal(buf)
+	require.NoError(t, err)
+
+	// Unmarshal it back to another instance of mh sorted index.
+	umSubject, err := index.New(multicodec.CarMultihashIndexSorted)
+	require.NoError(t, err)
+	err = umSubject.Unmarshal(buf)
+	require.NoError(t, err)
+
+	// Assert original records are present in both index instances with expected offset.
+	requireContainsAll(t, subject, records)
+	requireContainsAll(t, umSubject, records)
+}
+
+func TestMultiWidthCodedIndex_StableIterate(t *testing.T) {
+	rng := rand.New(rand.NewSource(1414))
+	records := generateIndexRecords(t, multihash.SHA2_256, rng)
+	records = append(records, generateIndexRecords(t, multihash.SHA2_512, rng)...)
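+	// Add identity multihashes too, so iteration has to walk three distinct code buckets.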
+	records = append(records, generateIndexRecords(t, multihash.IDENTITY, rng)...)
+
+	// Create a new mh sorted index and load randomly generated records into it.
+	idx, err := index.New(multicodec.CarMultihashIndexSorted)
+	require.NoError(t, err)
+	err = idx.Load(records)
+	require.NoError(t, err)
+
+	subject, ok := idx.(index.IterableIndex)
+	require.True(t, ok)
+
+	mh := make([]multihash.Multihash, 0, len(records))
+	require.NoError(t, subject.ForEach(func(m multihash.Multihash, _ uint64) error {
+		mh = append(mh, m)
+		return nil
+	}))
+
+	for i := 0; i < 10; i++ {
+		candidate := make([]multihash.Multihash, 0, len(records))
+		require.NoError(t, subject.ForEach(func(m multihash.Multihash, _ uint64) error {
+			candidate = append(candidate, m)
+			return nil
+		}))
+		require.Equal(t, mh, candidate)
+	}
+}
+
+func generateIndexRecords(t *testing.T, hasherCode uint64, rng *rand.Rand) []index.Record {
+	var records []index.Record
+	recordCount := rng.Intn(99) + 1 // 1 to 99 records
+	for i := 0; i < recordCount; i++ {
+		records = append(records, index.Record{
+			Cid:    generateCidV1(t, hasherCode, rng),
+			Offset: rng.Uint64(),
+		})
+	}
+	return records
+}
+
+func generateCidV1(t *testing.T, hasherCode uint64, rng *rand.Rand) cid.Cid {
+	data := []byte(fmt.Sprintf("🌊d-%d", rng.Uint64()))
+	mh, err := multihash.Sum(data, hasherCode, -1)
+	require.NoError(t, err)
+	return cid.NewCidV1(cid.Raw, mh)
+}
+
+func requireContainsAll(t *testing.T, subject index.Index, nonIdentityRecords []index.Record) {
+	for _, r := range nonIdentityRecords {
+		wantCid := r.Cid
+		wantOffset := r.Offset
+
+		var gotOffsets []uint64
+		err := subject.GetAll(wantCid, func(o uint64) bool {
+			gotOffsets = append(gotOffsets, o)
+			return false
+		})
+		require.NoError(t, err)
+		require.Equal(t, 1, len(gotOffsets))
+		require.Equal(t, wantOffset, gotOffsets[0])
+	}
+}
diff --git a/ipld/car/v2/index_gen.go b/ipld/car/v2/index_gen.go
new file mode 100644
index 0000000000..092ed434c1
--- /dev/null
+++ b/ipld/car/v2/index_gen.go
@@ -0,0 +1,231 @@
+package car
+
+import (
+	"fmt"
+	"io"
+	"os"
+
+	"github.com/ipfs/go-cid"
+	"github.com/ipld/go-car/v2/index"
+	"github.com/ipld/go-car/v2/internal/carv1"
+	internalio "github.com/ipld/go-car/v2/internal/io"
+	"github.com/multiformats/go-multihash"
+	"github.com/multiformats/go-varint"
+)
+
+// GenerateIndex generates an index for the given CAR payload reader.
+// The index can be stored in serialized format using index.WriteTo.
+//
+// Note, the index is re-generated every time even if the payload is in CARv2 format and already has
+// an index. To read an existing index when available, see ReadOrGenerateIndex.
+// See: LoadIndex.
+func GenerateIndex(v1r io.Reader, opts ...Option) (index.Index, error) {
+	wopts := ApplyOptions(opts...)
+	idx, err := index.New(wopts.IndexCodec)
+	if err != nil {
+		return nil, err
+	}
+	if err := LoadIndex(idx, v1r, opts...); err != nil {
+		return nil, err
+	}
+	return idx, nil
+}
+
+// LoadIndex populates idx with index records generated from r.
+// r may be in CARv1 or CARv2 format.
+//
+// If the StoreIdentityCIDs option is set when calling LoadIndex, identity
+// CIDs will be included in the index. By default this option is off, and
+// identity CIDs will not be included in the index.
+//
+// Note, the index is re-generated every time even if r is in CARv2 format and already has an index.
+// To read an existing index when available, see ReadOrGenerateIndex.
+func LoadIndex(idx index.Index, r io.Reader, opts ...Option) error {
+	// Parse Options.
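+	// For illustration, a call site enabling the options consulted below
+	// might look like (a sketch, not an exhaustive list of options):
+	//
+	//	err := LoadIndex(idx, r,
+	//		ZeroLengthSectionAsEOF(true), // treat zero-length sections as EOF
+	//		StoreIdentityCIDs(true),      // also index identity CIDs
+	//	)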
+	o := ApplyOptions(opts...)
+
+	reader := internalio.ToByteReadSeeker(r)
+	pragma, err := carv1.ReadHeader(r, o.MaxAllowedHeaderSize)
+	if err != nil {
+		return fmt.Errorf("error reading car header: %w", err)
+	}
+
+	var dataSize, dataOffset int64
+	switch pragma.Version {
+	case 1:
+		break
+	case 2:
+		// Read the V2 header, which should appear immediately after the pragma according to the CARv2 spec.
+		var v2h Header
+		_, err := v2h.ReadFrom(r)
+		if err != nil {
+			return err
+		}
+
+		// Sanity-check the CARv2 header.
+		if v2h.DataOffset < HeaderSize {
+			return fmt.Errorf("malformed CARv2; data offset too small: %d", v2h.DataOffset)
+		}
+		if v2h.DataSize < 1 {
+			return fmt.Errorf("malformed CARv2; data payload size too small: %d", v2h.DataSize)
+		}
+
+		// Seek to the beginning of the inner CARv1 payload.
+		_, err = reader.Seek(int64(v2h.DataOffset), io.SeekStart)
+		if err != nil {
+			return err
+		}
+
+		// Set dataSize and dataOffset, which are then used during index loading logic to decide
+		// where to stop and to adjust the section offset, respectively.
+		// Note that we could use a LimitReader here and re-define reader with it. However, that makes
+		// internalio.ToByteReadSeeker less efficient, since LimitReader implements neither
+		// ByteReader nor ReadSeeker.
+		dataSize = int64(v2h.DataSize)
+		dataOffset = int64(v2h.DataOffset)
+
+		// Read the inner CARv1 header to skip it and sanity check it.
+		v1h, err := carv1.ReadHeader(reader, o.MaxAllowedHeaderSize)
+		if err != nil {
+			return err
+		}
+		if v1h.Version != 1 {
+			return fmt.Errorf("expected data payload header version of 1; got %d", v1h.Version)
+		}
+	default:
+		return fmt.Errorf("expected either version 1 or 2; got %d", pragma.Version)
+	}
+
+	// Record the start of each section, with the first section starting from the current position
+	// in the reader, i.e. right after the header, since we have only read the header so far.
+	var sectionOffset int64
+
+	// The Seek call below is equivalent to getting the reader.offset directly.
+	// We get it through Seek to only depend on APIs of a typical io.Seeker.
+	// This would also reduce refactoring in case the utility reader is moved.
+	if sectionOffset, err = reader.Seek(0, io.SeekCurrent); err != nil {
+		return err
+	}
+
+	// Subtract the data offset; for CARv1 this is zero, otherwise the value comes from the
+	// CARv2 header.
+	sectionOffset -= dataOffset
+
+	records := make([]index.Record, 0)
+	for {
+		// Read the section's length.
+		sectionLen, err := varint.ReadUvarint(reader)
+		if err != nil {
+			if err == io.EOF {
+				break
+			}
+			return err
+		}
+
+		// Null padding; by default it's an error.
+		if sectionLen == 0 {
+			if o.ZeroLengthSectionAsEOF {
+				break
+			} else {
+				return fmt.Errorf("carv1 null padding not allowed by default; see ZeroLengthSectionAsEOF")
+			}
+		}
+
+		// Read the CID.
+		cidLen, c, err := cid.CidFromReader(reader)
+		if err != nil {
+			return err
+		}
+
+		if o.StoreIdentityCIDs || c.Prefix().MhType != multihash.IDENTITY {
+			if uint64(cidLen) > o.MaxIndexCidSize {
+				return &ErrCidTooLarge{MaxSize: o.MaxIndexCidSize, CurrentSize: uint64(cidLen)}
+			}
+			records = append(records, index.Record{Cid: c, Offset: uint64(sectionOffset)})
+		}
+
+		// Seek to the next section by skipping the block.
+		// The section length includes the CID, so subtract it.
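+		//
+		// Section layout recap (a sketch of the CARv1 framing consumed here):
+		//
+		//	| uvarint(cidLen+dataLen) | CID | data |
+		//
+		// so after reading the CID, dataLen == sectionLen-cidLen bytes remain.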
+		remainingSectionLen := int64(sectionLen) - int64(cidLen)
+		if sectionOffset, err = reader.Seek(remainingSectionLen, io.SeekCurrent); err != nil {
+			return err
+		}
+		// Subtract the data offset, which will be non-zero when reader represents a CARv2.
+		sectionOffset -= dataOffset
+
+		// Check if we have reached the end of the data payload and if so treat it as an EOF.
+		// Note, dataSize will be non-zero only if we are reading from a CARv2.
+		if dataSize != 0 && sectionOffset >= dataSize {
+			break
+		}
+	}
+
+	if err := idx.Load(records); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// GenerateIndexFromFile walks a CAR file at the given path and generates an index of cid->byte offset.
+// The index can be stored using index.WriteTo. Both CARv1 and CARv2 formats are accepted.
+//
+// Note, the index is re-generated every time even if the given CAR file is in CARv2 format and
+// already has an index. To read an existing index when available, see ReadOrGenerateIndex.
+//
+// See: GenerateIndex.
+func GenerateIndexFromFile(path string, opts ...Option) (index.Index, error) {
+	f, err := os.Open(path)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+	return GenerateIndex(f, opts...)
+}
+
+// ReadOrGenerateIndex accepts both CARv1 and CARv2 formats, and reads or generates an index for it.
+// When the given reader is in CARv1 format an index is always generated.
+// For a payload in CARv2 format, an index is only generated if Header.HasIndex returns false.
+// An error is returned for all other formats, i.e. a pragma with a version other than 1 or 2.
+//
+// Note, the returned index lives entirely in memory and will not depend on the
+// given reader to fulfill index lookup.
+func ReadOrGenerateIndex(rs io.ReadSeeker, opts ...Option) (index.Index, error) {
+	// Read version.
+	version, err := ReadVersion(rs, opts...)
+	if err != nil {
+		return nil, err
+	}
+	// Seek to the beginning, since reading the version changes the reader's offset.
+	if _, err := rs.Seek(0, io.SeekStart); err != nil {
+		return nil, err
+	}
+
+	switch version {
+	case 1:
+		// Simply generate the index, since there can't be a pre-existing one.
+		return GenerateIndex(rs, opts...)
+	case 2:
+		// Read CARv2 format.
+		v2r, err := NewReader(internalio.ToReaderAt(rs), opts...)
+		if err != nil {
+			return nil, err
+		}
+		// If an index is present, there is no need to generate one; decode and return it.
+		if v2r.Header.HasIndex() {
+			ir, err := v2r.IndexReader()
+			if err != nil {
+				return nil, err
+			}
+			return index.ReadFrom(ir)
+		}
+		// Otherwise, generate an index from the CARv1 payload wrapped within the CARv2 format.
+		dr, err := v2r.DataReader()
+		if err != nil {
+			return nil, err
+		}
+		return GenerateIndex(dr, opts...)
+ default: + return nil, fmt.Errorf("unknown version %v", version) + } +} diff --git a/ipld/car/v2/index_gen_test.go b/ipld/car/v2/index_gen_test.go new file mode 100644 index 0000000000..61fd055da6 --- /dev/null +++ b/ipld/car/v2/index_gen_test.go @@ -0,0 +1,274 @@ +package car_test + +import ( + "io" + "os" + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/ipfs/go-cid" + carv2 "github.com/ipld/go-car/v2" + "github.com/ipld/go-car/v2/index" + "github.com/ipld/go-car/v2/internal/carv1" + internalio "github.com/ipld/go-car/v2/internal/io" + "github.com/multiformats/go-multicodec" + "github.com/multiformats/go-multihash" + "github.com/multiformats/go-varint" + "github.com/stretchr/testify/require" +) + +func TestGenerateIndex(t *testing.T) { + type testCase struct { + name string + carPath string + opts []carv2.Option + wantIndexer func(t *testing.T) index.Index + wantErr bool + } + tests := []testCase{ + { + name: "CarV1IsIndexedAsExpected", + carPath: "testdata/sample-v1.car", + wantIndexer: func(t *testing.T) index.Index { + v1, err := os.Open("testdata/sample-v1.car") + require.NoError(t, err) + t.Cleanup(func() { assert.NoError(t, v1.Close()) }) + want, err := carv2.GenerateIndex(v1) + require.NoError(t, err) + return want + }, + }, + { + name: "CarV2WithIndexIsReturnedAsExpected", + carPath: "testdata/sample-wrapped-v2.car", + wantIndexer: func(t *testing.T) index.Index { + v2, err := os.Open("testdata/sample-wrapped-v2.car") + require.NoError(t, err) + t.Cleanup(func() { assert.NoError(t, v2.Close()) }) + reader, err := carv2.NewReader(v2) + require.NoError(t, err) + ir, err := reader.IndexReader() + require.NoError(t, err) + want, err := index.ReadFrom(ir) + require.NoError(t, err) + return want + }, + }, + { + name: "CarV1WithZeroLenSectionIsGeneratedAsExpected", + carPath: "testdata/sample-v1-with-zero-len-section.car", + opts: []carv2.Option{carv2.ZeroLengthSectionAsEOF(true)}, + wantIndexer: func(t *testing.T) index.Index { + v1, err := os.Open("testdata/sample-v1-with-zero-len-section.car") + require.NoError(t, err) + t.Cleanup(func() { assert.NoError(t, v1.Close()) }) + want, err := carv2.GenerateIndex(v1, carv2.ZeroLengthSectionAsEOF(true)) + require.NoError(t, err) + return want + }, + }, + { + name: "AnotherCarV1WithZeroLenSectionIsGeneratedAsExpected", + carPath: "testdata/sample-v1-with-zero-len-section2.car", + opts: []carv2.Option{carv2.ZeroLengthSectionAsEOF(true)}, + wantIndexer: func(t *testing.T) index.Index { + v1, err := os.Open("testdata/sample-v1-with-zero-len-section2.car") + require.NoError(t, err) + t.Cleanup(func() { assert.NoError(t, v1.Close()) }) + want, err := carv2.GenerateIndex(v1, carv2.ZeroLengthSectionAsEOF(true)) + require.NoError(t, err) + return want + }, + }, + { + name: "CarV1WithZeroLenSectionWithoutOptionIsError", + carPath: "testdata/sample-v1-with-zero-len-section.car", + wantErr: true, + }, + { + name: "CarOtherThanV1OrV2IsError", + carPath: "testdata/sample-rootless-v42.car", + wantIndexer: func(t *testing.T) index.Index { return nil }, + wantErr: true, + }, + } + + requireWant := func(tt testCase, got index.Index, gotErr error) { + if tt.wantErr { + require.Error(t, gotErr) + } else { + require.NoError(t, gotErr) + var want index.Index + if tt.wantIndexer != nil { + want = tt.wantIndexer(t) + } + if want == nil { + require.Nil(t, got) + } else { + require.Equal(t, want, got) + } + } + } + + for _, tt := range tests { + t.Run("ReadOrGenerateIndex_"+tt.name, func(t *testing.T) { + carFile, err := os.Open(tt.carPath) + 
require.NoError(t, err) + t.Cleanup(func() { assert.NoError(t, carFile.Close()) }) + got, gotErr := carv2.ReadOrGenerateIndex(carFile, tt.opts...) + requireWant(tt, got, gotErr) + }) + t.Run("GenerateIndexFromFile_"+tt.name, func(t *testing.T) { + got, gotErr := carv2.GenerateIndexFromFile(tt.carPath, tt.opts...) + requireWant(tt, got, gotErr) + }) + t.Run("LoadIndex_"+tt.name, func(t *testing.T) { + carFile, err := os.Open(tt.carPath) + require.NoError(t, err) + got, err := index.New(multicodec.CarMultihashIndexSorted) + require.NoError(t, err) + gotErr := carv2.LoadIndex(got, carFile, tt.opts...) + requireWant(tt, got, gotErr) + }) + t.Run("GenerateIndex_"+tt.name, func(t *testing.T) { + carFile, err := os.Open(tt.carPath) + require.NoError(t, err) + got, gotErr := carv2.GenerateIndex(carFile, tt.opts...) + requireWant(tt, got, gotErr) + }) + } +} + +func TestMultihashIndexSortedConsistencyWithIndexSorted(t *testing.T) { + path := "testdata/sample-v1.car" + + sortedIndex, err := carv2.GenerateIndexFromFile(path) + require.NoError(t, err) + require.Equal(t, multicodec.CarMultihashIndexSorted, sortedIndex.Codec()) + + f, err := os.Open(path) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, f.Close()) }) + br, err := carv2.NewBlockReader(f) + require.NoError(t, err) + + subject := generateMultihashSortedIndex(t, path) + for { + wantNext, err := br.Next() + if err == io.EOF { + break + } + require.NoError(t, err) + + dmh, err := multihash.Decode(wantNext.Cid().Hash()) + require.NoError(t, err) + if dmh.Code == multihash.IDENTITY { + continue + } + + wantCid := wantNext.Cid() + var wantOffsets []uint64 + err = sortedIndex.GetAll(wantCid, func(o uint64) bool { + wantOffsets = append(wantOffsets, o) + return false + }) + require.NoError(t, err) + + var gotOffsets []uint64 + err = subject.GetAll(wantCid, func(o uint64) bool { + gotOffsets = append(gotOffsets, o) + return false + }) + + require.NoError(t, err) + require.Equal(t, wantOffsets, gotOffsets) + } +} + +func TestMultihashSorted_ForEachIsConsistentWithGetAll(t *testing.T) { + path := "testdata/sample-v1.car" + f, err := os.Open(path) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, f.Close()) }) + + br, err := carv2.NewBlockReader(f) + require.NoError(t, err) + subject := generateMultihashSortedIndex(t, path) + + gotForEach := make(map[string]uint64) + err = subject.ForEach(func(mh multihash.Multihash, offset uint64) error { + gotForEach[mh.String()] = offset + return nil + }) + require.NoError(t, err) + + for { + b, err := br.Next() + if err == io.EOF { + break + } + require.NoError(t, err) + c := b.Cid() + dmh, err := multihash.Decode(c.Hash()) + require.NoError(t, err) + if dmh.Code == multihash.IDENTITY { + continue + } + + wantMh := c.Hash() + + var wantOffset uint64 + err = subject.GetAll(c, func(u uint64) bool { + wantOffset = u + return false + }) + require.NoError(t, err) + + s := wantMh.String() + gotOffset, ok := gotForEach[s] + require.True(t, ok) + require.Equal(t, wantOffset, gotOffset) + } +} + +func generateMultihashSortedIndex(t *testing.T, path string) *index.MultihashIndexSorted { + f, err := os.Open(path) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, f.Close()) }) + reader := internalio.ToByteReadSeeker(f) + header, err := carv1.ReadHeader(reader, carv1.DefaultMaxAllowedHeaderSize) + require.NoError(t, err) + require.Equal(t, uint64(1), header.Version) + + idx := index.NewMultihashSorted() + records := make([]index.Record, 0) + + var sectionOffset int64 + 
sectionOffset, err = reader.Seek(0, io.SeekCurrent) + require.NoError(t, err) + + for { + sectionLen, err := varint.ReadUvarint(reader) + if err == io.EOF { + break + } + require.NoError(t, err) + + if sectionLen == 0 { + break + } + + cidLen, c, err := cid.CidFromReader(reader) + require.NoError(t, err) + records = append(records, index.Record{Cid: c, Offset: uint64(sectionOffset)}) + remainingSectionLen := int64(sectionLen) - int64(cidLen) + sectionOffset, err = reader.Seek(remainingSectionLen, io.SeekCurrent) + require.NoError(t, err) + } + + err = idx.Load(records) + require.NoError(t, err) + + return idx +} diff --git a/ipld/car/v2/internal/carv1/car.go b/ipld/car/v2/internal/carv1/car.go new file mode 100644 index 0000000000..a38744a3ca --- /dev/null +++ b/ipld/car/v2/internal/carv1/car.go @@ -0,0 +1,284 @@ +package carv1 + +import ( + "context" + "fmt" + "io" + + "github.com/ipld/go-car/v2/internal/carv1/util" + + cid "github.com/ipfs/go-cid" + cbor "github.com/ipfs/go-ipld-cbor" + format "github.com/ipfs/go-ipld-format" + blocks "github.com/ipfs/go-libipfs/blocks" + "github.com/ipfs/go-merkledag" + internalio "github.com/ipld/go-car/v2/internal/io" +) + +const DefaultMaxAllowedHeaderSize uint64 = 32 << 20 // 32MiB +const DefaultMaxAllowedSectionSize uint64 = 8 << 20 // 8MiB + +func init() { + cbor.RegisterCborType(CarHeader{}) +} + +type Store interface { + Put(context.Context, blocks.Block) error +} + +type ReadStore interface { + Get(context.Context, cid.Cid) (blocks.Block, error) +} + +type CarHeader struct { + Roots []cid.Cid + Version uint64 +} + +type carWriter struct { + ds format.NodeGetter + w io.Writer +} + +func WriteCar(ctx context.Context, ds format.NodeGetter, roots []cid.Cid, w io.Writer) error { + h := &CarHeader{ + Roots: roots, + Version: 1, + } + + if err := WriteHeader(h, w); err != nil { + return fmt.Errorf("failed to write car header: %s", err) + } + + cw := &carWriter{ds: ds, w: w} + seen := cid.NewSet() + for _, r := range roots { + if err := merkledag.Walk(ctx, cw.enumGetLinks, r, seen.Visit); err != nil { + return err + } + } + return nil +} + +func ReadHeaderAt(at io.ReaderAt, maxReadBytes uint64) (*CarHeader, error) { + var rr io.Reader + switch r := at.(type) { + case io.Reader: + rr = r + default: + var err error + rr, err = internalio.NewOffsetReadSeeker(r, 0) + if err != nil { + return nil, err + } + } + return ReadHeader(rr, maxReadBytes) +} + +func ReadHeader(r io.Reader, maxReadBytes uint64) (*CarHeader, error) { + hb, err := util.LdRead(r, false, maxReadBytes) + if err != nil { + if err == util.ErrSectionTooLarge { + err = util.ErrHeaderTooLarge + } + return nil, err + } + + var ch CarHeader + if err := cbor.DecodeInto(hb, &ch); err != nil { + return nil, fmt.Errorf("invalid header: %v", err) + } + + return &ch, nil +} + +func WriteHeader(h *CarHeader, w io.Writer) error { + hb, err := cbor.DumpObject(h) + if err != nil { + return err + } + + return util.LdWrite(w, hb) +} + +func HeaderSize(h *CarHeader) (uint64, error) { + hb, err := cbor.DumpObject(h) + if err != nil { + return 0, err + } + + return util.LdSize(hb), nil +} + +func (cw *carWriter) enumGetLinks(ctx context.Context, c cid.Cid) ([]*format.Link, error) { + nd, err := cw.ds.Get(ctx, c) + if err != nil { + return nil, err + } + + if err := cw.writeNode(ctx, nd); err != nil { + return nil, err + } + + return nd.Links(), nil +} + +func (cw *carWriter) writeNode(ctx context.Context, nd format.Node) error { + return util.LdWrite(cw.w, nd.Cid().Bytes(), nd.RawData()) +} + +type CarReader 
struct {
+	r                     io.Reader
+	Header                *CarHeader
+	zeroLenAsEOF          bool
+	maxAllowedSectionSize uint64
+}
+
+func NewCarReaderWithZeroLengthSectionAsEOF(r io.Reader) (*CarReader, error) {
+	return NewCarReaderWithoutDefaults(r, true, DefaultMaxAllowedHeaderSize, DefaultMaxAllowedSectionSize)
+}
+
+func NewCarReader(r io.Reader) (*CarReader, error) {
+	return NewCarReaderWithoutDefaults(r, false, DefaultMaxAllowedHeaderSize, DefaultMaxAllowedSectionSize)
+}
+
+func NewCarReaderWithoutDefaults(r io.Reader, zeroLenAsEOF bool, maxAllowedHeaderSize uint64, maxAllowedSectionSize uint64) (*CarReader, error) {
+	ch, err := ReadHeader(r, maxAllowedHeaderSize)
+	if err != nil {
+		return nil, err
+	}
+
+	if ch.Version != 1 {
+		return nil, fmt.Errorf("invalid car version: %d", ch.Version)
+	}
+
+	if len(ch.Roots) == 0 {
+		return nil, fmt.Errorf("empty car, no roots")
+	}
+
+	return &CarReader{
+		r:                     r,
+		Header:                ch,
+		zeroLenAsEOF:          zeroLenAsEOF,
+		maxAllowedSectionSize: maxAllowedSectionSize,
+	}, nil
+}
+
+func (cr *CarReader) Next() (blocks.Block, error) {
+	c, data, err := util.ReadNode(cr.r, cr.zeroLenAsEOF, cr.maxAllowedSectionSize)
+	if err != nil {
+		return nil, err
+	}
+
+	hashed, err := c.Prefix().Sum(data)
+	if err != nil {
+		return nil, err
+	}
+
+	if !hashed.Equals(c) {
+		return nil, fmt.Errorf("mismatch in content integrity, name: %s, data: %s", c, hashed)
+	}
+
+	return blocks.NewBlockWithCid(data, c)
+}
+
+type batchStore interface {
+	PutMany(context.Context, []blocks.Block) error
+}
+
+func LoadCar(s Store, r io.Reader) (*CarHeader, error) {
+	ctx := context.TODO()
+	cr, err := NewCarReader(r)
+	if err != nil {
+		return nil, err
+	}
+
+	if bs, ok := s.(batchStore); ok {
+		return loadCarFast(ctx, bs, cr)
+	}
+
+	return loadCarSlow(ctx, s, cr)
+}
+
+func loadCarFast(ctx context.Context, s batchStore, cr *CarReader) (*CarHeader, error) {
+	var buf []blocks.Block
+	for {
+		blk, err := cr.Next()
+		if err != nil {
+			if err == io.EOF {
+				if len(buf) > 0 {
+					if err := s.PutMany(ctx, buf); err != nil {
+						return nil, err
+					}
+				}
+				return cr.Header, nil
+			}
+			return nil, err
+		}
+
+		buf = append(buf, blk)
+
+		if len(buf) > 1000 {
+			if err := s.PutMany(ctx, buf); err != nil {
+				return nil, err
+			}
+			buf = buf[:0]
+		}
+	}
+}
+
+func loadCarSlow(ctx context.Context, s Store, cr *CarReader) (*CarHeader, error) {
+	for {
+		blk, err := cr.Next()
+		if err != nil {
+			if err == io.EOF {
+				return cr.Header, nil
+			}
+			return nil, err
+		}
+
+		if err := s.Put(ctx, blk); err != nil {
+			return nil, err
+		}
+	}
+}
+
+// Matches checks whether two headers match.
+// Two headers are considered matching if:
+//  1. They have the same version number, and
+//  2. They contain the same root CIDs in any order.
+//
+// Note, this function explicitly ignores the order of roots.
+// If the order of roots matters, use reflect.DeepEqual instead.
+func (h CarHeader) Matches(other CarHeader) bool {
+	if h.Version != other.Version {
+		return false
+	}
+	thisLen := len(h.Roots)
+	if thisLen != len(other.Roots) {
+		return false
+	}
+	// Headers with a single root are popular.
+	// Implement a fast execution path for popular cases.
+	if thisLen == 1 {
+		return h.Roots[0].Equals(other.Roots[0])
+	}
+
+	// Check that other contains all roots.
+	// TODO: should this be optimised for cases where the number of roots is large, since it has O(N^2) complexity?
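+	//
+	// A hedged sketch of a linear-time alternative (not used here, to keep
+	// the allocation-free path for small headers):
+	//
+	//	seen := make(map[cid.Cid]struct{}, len(other.Roots))
+	//	for _, r := range other.Roots {
+	//		seen[r] = struct{}{}
+	//	}
+	//	for _, r := range h.Roots {
+	//		if _, ok := seen[r]; !ok {
+	//			return false
+	//		}
+	//	}
+	//	return true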
+ for _, r := range h.Roots { + if !other.containsRoot(r) { + return false + } + } + return true +} + +func (h *CarHeader) containsRoot(root cid.Cid) bool { + for _, r := range h.Roots { + if r.Equals(root) { + return true + } + } + return false +} diff --git a/ipld/car/v2/internal/carv1/car_test.go b/ipld/car/v2/internal/carv1/car_test.go new file mode 100644 index 0000000000..bdd57573f9 --- /dev/null +++ b/ipld/car/v2/internal/carv1/car_test.go @@ -0,0 +1,333 @@ +package carv1 + +import ( + "bytes" + "context" + "encoding/hex" + "io" + "os" + "strings" + "testing" + + "github.com/stretchr/testify/require" + + cid "github.com/ipfs/go-cid" + format "github.com/ipfs/go-ipld-format" + "github.com/ipfs/go-merkledag" + dstest "github.com/ipfs/go-merkledag/test" +) + +func assertAddNodes(t *testing.T, ds format.DAGService, nds ...format.Node) { + for _, nd := range nds { + if err := ds.Add(context.Background(), nd); err != nil { + t.Fatal(err) + } + } +} + +func TestRoundtrip(t *testing.T) { + dserv := dstest.Mock() + a := merkledag.NewRawNode([]byte("aaaa")) + b := merkledag.NewRawNode([]byte("bbbb")) + c := merkledag.NewRawNode([]byte("cccc")) + + nd1 := &merkledag.ProtoNode{} + nd1.AddNodeLink("cat", a) + + nd2 := &merkledag.ProtoNode{} + nd2.AddNodeLink("first", nd1) + nd2.AddNodeLink("dog", b) + + nd3 := &merkledag.ProtoNode{} + nd3.AddNodeLink("second", nd2) + nd3.AddNodeLink("bear", c) + + assertAddNodes(t, dserv, a, b, c, nd1, nd2, nd3) + + buf := new(bytes.Buffer) + if err := WriteCar(context.Background(), dserv, []cid.Cid{nd3.Cid()}, buf); err != nil { + t.Fatal(err) + } + + bserv := dstest.Bserv() + ch, err := LoadCar(bserv.Blockstore(), buf) + if err != nil { + t.Fatal(err) + } + + if len(ch.Roots) != 1 { + t.Fatal("should have one root") + } + + if !ch.Roots[0].Equals(nd3.Cid()) { + t.Fatal("got wrong cid") + } + + bs := bserv.Blockstore() + for _, nd := range []format.Node{a, b, c, nd1, nd2, nd3} { + has, err := bs.Has(context.TODO(), nd.Cid()) + if err != nil { + t.Fatal(err) + } + + if !has { + t.Fatal("should have cid in blockstore") + } + } +} + +func TestEOFHandling(t *testing.T) { + // fixture is a clean single-block, single-root CAR + fixture, err := hex.DecodeString("3aa265726f6f747381d82a58250001711220151fe9e73c6267a7060c6f6c4cca943c236f4b196723489608edb42a8b8fa80b6776657273696f6e012c01711220151fe9e73c6267a7060c6f6c4cca943c236f4b196723489608edb42a8b8fa80ba165646f646779f5") + if err != nil { + t.Fatal(err) + } + + load := func(t *testing.T, byts []byte) *CarReader { + cr, err := NewCarReader(bytes.NewReader(byts)) + if err != nil { + t.Fatal(err) + } + + blk, err := cr.Next() + if err != nil { + t.Fatal(err) + } + if blk.Cid().String() != "bafyreiavd7u6opdcm6tqmddpnrgmvfb4enxuwglhenejmchnwqvixd5ibm" { + t.Fatal("unexpected CID") + } + + return cr + } + + t.Run("CleanEOF", func(t *testing.T) { + cr := load(t, fixture) + + blk, err := cr.Next() + if err != io.EOF { + t.Fatal("Didn't get expected EOF") + } + if blk != nil { + t.Fatal("EOF returned expected block") + } + }) + + t.Run("BadVarint", func(t *testing.T) { + fixtureBadVarint := append(fixture, 160) + cr := load(t, fixtureBadVarint) + + blk, err := cr.Next() + if err != io.ErrUnexpectedEOF { + t.Fatal("Didn't get unexpected EOF") + } + if blk != nil { + t.Fatal("EOF returned unexpected block") + } + }) + + t.Run("TruncatedBlock", func(t *testing.T) { + fixtureTruncatedBlock := append(fixture, 100, 0, 0) + cr := load(t, fixtureTruncatedBlock) + + blk, err := cr.Next() + if err != io.ErrUnexpectedEOF { + 
t.Fatal("Didn't get unexpected EOF")
+		}
+		if blk != nil {
+			t.Fatal("EOF returned unexpected block")
+		}
+	})
+}
+
+func TestBadHeaders(t *testing.T) {
+	testCases := []struct {
+		name   string
+		hex    string
+		errStr string // either the whole error string
+		errPfx string // or just the prefix
+	}{
+		{
+			"{version:2}",
+			"0aa16776657273696f6e02",
+			"invalid car version: 2",
+			"",
+		},
+		{
+			// an unfortunate error because we don't use a pointer
+			"{roots:[baeaaaa3bmjrq]}",
+			"13a165726f6f747381d82a480001000003616263",
+			"invalid car version: 0",
+			"",
+		},
+		{
+			"{version:\"1\",roots:[baeaaaa3bmjrq]}",
+			"1da265726f6f747381d82a4800010000036162636776657273696f6e6131",
+			"", "invalid header: ",
+		},
+		{
+			"{version:1}",
+			"0aa16776657273696f6e01",
+			"empty car, no roots",
+			"",
+		},
+		{
+			"{version:1,roots:{cid:baeaaaa3bmjrq}}",
+			"20a265726f6f7473a163636964d82a4800010000036162636776657273696f6e01",
+			"",
+			"invalid header: ",
+		},
+		{
+			"{version:1,roots:[baeaaaa3bmjrq],blip:true}",
+			"22a364626c6970f565726f6f747381d82a4800010000036162636776657273696f6e01",
+			"",
+			"invalid header: ",
+		},
+		{
+			"[1,[]]",
+			"03820180",
+			"",
+			"invalid header: ",
+		},
+		{
+			// this is an unfortunate error, it'd be nice to catch it better but it's
+			// very unlikely we'd ever see this in practice
+			"null",
+			"01f6",
+			"",
+			"invalid car version: 0",
+		},
+	}
+
+	makeCar := func(t *testing.T, byts string) error {
+		fixture, err := hex.DecodeString(byts)
+		if err != nil {
+			t.Fatal(err)
+		}
+		_, err = NewCarReader(bytes.NewReader(fixture))
+		return err
+	}
+
+	t.Run("Sanity check {version:1,roots:[baeaaaa3bmjrq]}", func(t *testing.T) {
+		err := makeCar(t, "1ca265726f6f747381d82a4800010000036162636776657273696f6e01")
+		if err != nil {
+			t.Fatal(err)
+		}
+	})
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			err := makeCar(t, tc.hex)
+			if err == nil {
+				t.Fatal("expected error from bad header, didn't get one")
+			}
+			if tc.errStr != "" {
+				if err.Error() != tc.errStr {
+					t.Fatalf("bad error: %v", err)
+				}
+			} else {
+				if !strings.HasPrefix(err.Error(), tc.errPfx) {
+					t.Fatalf("bad error: %v", err)
+				}
+			}
+		})
+	}
+}
+
+func TestCarHeaderMatches(t *testing.T) {
+	oneCid := merkledag.NewRawNode([]byte("fish")).Cid()
+	anotherCid := merkledag.NewRawNode([]byte("lobster")).Cid()
+	tests := []struct {
+		name  string
+		one   CarHeader
+		other CarHeader
+		want  bool
+	}{
+		{
+			"SameVersionNilRootsIsMatching",
+			CarHeader{nil, 1},
+			CarHeader{nil, 1},
+			true,
+		},
+		{
+			"SameVersionEmptyRootsIsMatching",
+			CarHeader{[]cid.Cid{}, 1},
+			CarHeader{[]cid.Cid{}, 1},
+			true,
+		},
+		{
+			"SameVersionNonEmptySameRootsIsMatching",
+			CarHeader{[]cid.Cid{oneCid}, 1},
+			CarHeader{[]cid.Cid{oneCid}, 1},
+			true,
+		},
+		{
+			"SameVersionNonEmptySameRootsInDifferentOrderIsMatching",
+			CarHeader{[]cid.Cid{oneCid, anotherCid}, 1},
+			CarHeader{[]cid.Cid{anotherCid, oneCid}, 1},
+			true,
+		},
+		{
+			"SameVersionDifferentRootsIsNotMatching",
+			CarHeader{[]cid.Cid{oneCid}, 1},
+			CarHeader{[]cid.Cid{anotherCid}, 1},
+			false,
+		},
+		{
+			"DifferentVersionDifferentRootsIsNotMatching",
+			CarHeader{[]cid.Cid{oneCid}, 0},
+			CarHeader{[]cid.Cid{anotherCid}, 1},
+			false,
+		},
+		{
+			"MismatchingVersionIsNotMatching",
+			CarHeader{nil, 0},
+			CarHeader{nil, 1},
+			false,
+		},
+		{
+			"ZeroValueHeadersAreMatching",
+			CarHeader{},
+			CarHeader{},
+			true,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := tt.one.Matches(tt.other)
+			require.Equal(t, tt.want, got, "Matches() = %v, want %v", got, tt.want)
+		})
+	}
+}
+
+func TestReadingZeroLengthSectionWithoutOptionSetIsError(t *testing.T) {
+	f, err := os.Open("../../testdata/sample-v1-with-zero-len-section.car")
+	require.NoError(t, err)
+	subject, err := NewCarReader(f)
+	require.NoError(t, err)
+
+	for {
+		_, err := subject.Next()
+		if err == io.EOF {
+			break
+		} else if err != nil {
+			require.EqualError(t, err, "varints malformed, could not reach the end")
+			return
+		}
+	}
+	require.Fail(t, "expected error when reading file with zero-length section without option set")
+}
+
+func TestReadingZeroLengthSectionWithOptionSetIsSuccess(t *testing.T) {
+	f, err := os.Open("../../testdata/sample-v1-with-zero-len-section.car")
+	require.NoError(t, err)
+	subject, err := NewCarReaderWithZeroLengthSectionAsEOF(f)
+	require.NoError(t, err)
+
+	for {
+		_, err := subject.Next()
+		if err == io.EOF {
+			break
+		}
+		require.NoError(t, err)
+	}
+}
diff --git a/ipld/car/v2/internal/carv1/doc.go b/ipld/car/v2/internal/carv1/doc.go
new file mode 100644
index 0000000000..821ca2f0aa
--- /dev/null
+++ b/ipld/car/v2/internal/carv1/doc.go
@@ -0,0 +1,2 @@
+// Package carv1 is forked from the CARv1 implementation to avoid a dependency on ipld-prime 0.9.0 due to outstanding upgrades in Filecoin.
+package carv1
diff --git a/ipld/car/v2/internal/carv1/util/util.go b/ipld/car/v2/internal/carv1/util/util.go
new file mode 100644
index 0000000000..00cd885b27
--- /dev/null
+++ b/ipld/car/v2/internal/carv1/util/util.go
@@ -0,0 +1,98 @@
+package util
+
+import (
+	"errors"
+	"io"
+
+	internalio "github.com/ipld/go-car/v2/internal/io"
+
+	"github.com/multiformats/go-varint"
+
+	cid "github.com/ipfs/go-cid"
+)
+
+var ErrSectionTooLarge = errors.New("invalid section data, length of read beyond allowable maximum")
+var ErrHeaderTooLarge = errors.New("invalid header data, length of read beyond allowable maximum")
+
+type BytesReader interface {
+	io.Reader
+	io.ByteReader
+}
+
+func ReadNode(r io.Reader, zeroLenAsEOF bool, maxReadBytes uint64) (cid.Cid, []byte, error) {
+	data, err := LdRead(r, zeroLenAsEOF, maxReadBytes)
+	if err != nil {
+		return cid.Cid{}, nil, err
+	}
+
+	n, c, err := cid.CidFromBytes(data)
+	if err != nil {
+		return cid.Cid{}, nil, err
+	}
+
+	return c, data[n:], nil
+}
+
+func LdWrite(w io.Writer, d ...[]byte) error {
+	var sum uint64
+	for _, s := range d {
+		sum += uint64(len(s))
+	}
+
+	buf := make([]byte, 8)
+	n := varint.PutUvarint(buf, sum)
+	_, err := w.Write(buf[:n])
+	if err != nil {
+		return err
+	}
+
+	for _, s := range d {
+		_, err = w.Write(s)
+		if err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func LdSize(d ...[]byte) uint64 {
+	var sum uint64
+	for _, s := range d {
+		sum += uint64(len(s))
+	}
+	s := varint.UvarintSize(sum)
+	return sum + uint64(s)
+}
+
+func LdReadSize(r io.Reader, zeroLenAsEOF bool, maxReadBytes uint64) (uint64, error) {
+	l, err := varint.ReadUvarint(internalio.ToByteReader(r))
+	if err != nil {
+		// If the decoded length is non-zero when the error is EOF, signal an unclean EOF.
+		if l > 0 && err == io.EOF {
+			return 0, io.ErrUnexpectedEOF
+		}
+		return 0, err
+	} else if l == 0 && zeroLenAsEOF {
+		return 0, io.EOF
+	}
+
+	if l > maxReadBytes { // Don't OOM
+		return 0, ErrSectionTooLarge
+	}
+	return l, nil
+}
+
+func LdRead(r io.Reader, zeroLenAsEOF bool, maxReadBytes uint64) ([]byte, error) {
+	l, err := LdReadSize(r, zeroLenAsEOF, maxReadBytes)
+	if err != nil {
+		return nil, err
+	}
+
+	buf := make([]byte, l)
+	if _, err := io.ReadFull(r, buf); err != nil {
+		return nil, err
+	}
+
+	return buf, nil
+}
diff --git a/ipld/car/v2/internal/carv1/util/util_test.go b/ipld/car/v2/internal/carv1/util/util_test.go
new file mode 100644
index 0000000000..27719183a8
--- /dev/null
+++ b/ipld/car/v2/internal/carv1/util/util_test.go
@@ -0,0 +1,28 @@
+package util_test
+
+import (
+	"bytes"
+	crand "crypto/rand"
+	"math/rand"
+	"testing"
+
+	"github.com/ipld/go-car/v2/internal/carv1/util"
+
+	"github.com/stretchr/testify/require"
+)
+
+func TestLdSize(t *testing.T) {
+	for i := 0; i < 5; i++ {
+		var buf bytes.Buffer
+		data := make([][]byte, 5)
+		for j := 0; j < 5; j++ {
+			data[j] = make([]byte, rand.Intn(30))
+			_, err := crand.Read(data[j])
+			require.NoError(t, err)
+		}
+		size := util.LdSize(data...)
+		err := util.LdWrite(&buf, data...)
+		require.NoError(t, err)
+		require.Equal(t, uint64(len(buf.Bytes())), size)
+	}
+}
diff --git a/ipld/car/v2/internal/errsort/search.go b/ipld/car/v2/internal/errsort/search.go
new file mode 100644
index 0000000000..fb88617940
--- /dev/null
+++ b/ipld/car/v2/internal/errsort/search.go
@@ -0,0 +1,54 @@
+/*
+Copyright (c) 2009 The Go Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+   * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+package errsort
+
+// Search is like sort.Search but accepts an erroring closure.
+// If it errors, the search is terminated immediately.
+func Search(n int, f func(int) (bool, error)) (int, error) {
+	// Define f(-1) == false and f(n) == true.
+	// Invariant: f(i-1) == false, f(j) == true.
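+	// Worked example: n=4 with f yielding [false, false, true, true] probes
+	// h=2 (true, so j=2), then h=1 (false, so i=2), and returns 2.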
+ i, j := 0, n + for i < j { + h := int(uint(i+j) >> 1) // avoid overflow when computing h + // i ≤ h < j + less, err := f(h) + if err != nil { + return 0, err + } + if !less { + i = h + 1 // preserves f(i-1) == false + } else { + j = h // preserves f(j) == true + } + } + // i == j, f(i-1) == false, and f(j) (= f(i)) == true => answer is i. + return i, nil +} diff --git a/ipld/car/v2/internal/io/converter.go b/ipld/car/v2/internal/io/converter.go new file mode 100644 index 0000000000..2e2844bd0e --- /dev/null +++ b/ipld/car/v2/internal/io/converter.go @@ -0,0 +1,155 @@ +package io + +import ( + "errors" + "io" + "sync" +) + +var ( + _ io.ByteReader = (*readerPlusByte)(nil) + _ io.ByteReader = (*readSeekerPlusByte)(nil) + _ io.ByteReader = (*discardingReadSeekerPlusByte)(nil) + _ io.ReadSeeker = (*discardingReadSeekerPlusByte)(nil) + _ io.ReadSeeker = (*readerAtSeeker)(nil) + _ io.ReaderAt = (*readSeekerAt)(nil) +) + +type ( + readerPlusByte struct { + io.Reader + + byteBuf [1]byte // escapes via io.Reader.Read; preallocate + } + + readSeekerPlusByte struct { + io.ReadSeeker + + byteBuf [1]byte // escapes via io.Reader.Read; preallocate + } + + discardingReadSeekerPlusByte struct { + io.Reader + offset int64 + + byteBuf [1]byte // escapes via io.Reader.Read; preallocate + } + + ByteReadSeeker interface { + io.ReadSeeker + io.ByteReader + } + + readSeekerAt struct { + rs io.ReadSeeker + mu sync.Mutex + } + + readerAtSeeker struct { + ra io.ReaderAt + position int64 + mu sync.Mutex + } +) + +func ToByteReader(r io.Reader) io.ByteReader { + if br, ok := r.(io.ByteReader); ok { + return br + } + return &readerPlusByte{Reader: r} +} + +func ToByteReadSeeker(r io.Reader) ByteReadSeeker { + if brs, ok := r.(ByteReadSeeker); ok { + return brs + } + if rs, ok := r.(io.ReadSeeker); ok { + return &readSeekerPlusByte{ReadSeeker: rs} + } + return &discardingReadSeekerPlusByte{Reader: r} +} + +func ToReadSeeker(ra io.ReaderAt) io.ReadSeeker { + if rs, ok := ra.(io.ReadSeeker); ok { + return rs + } + return &readerAtSeeker{ra: ra} +} + +func ToReaderAt(rs io.ReadSeeker) io.ReaderAt { + if ra, ok := rs.(io.ReaderAt); ok { + return ra + } + return &readSeekerAt{rs: rs} +} + +func (rb *readerPlusByte) ReadByte() (byte, error) { + _, err := io.ReadFull(rb, rb.byteBuf[:]) + return rb.byteBuf[0], err +} + +func (rsb *readSeekerPlusByte) ReadByte() (byte, error) { + _, err := io.ReadFull(rsb, rsb.byteBuf[:]) + return rsb.byteBuf[0], err +} + +func (drsb *discardingReadSeekerPlusByte) ReadByte() (byte, error) { + _, err := io.ReadFull(drsb, drsb.byteBuf[:]) + return drsb.byteBuf[0], err +} + +func (drsb *discardingReadSeekerPlusByte) Read(p []byte) (read int, err error) { + read, err = drsb.Reader.Read(p) + drsb.offset += int64(read) + return +} + +func (drsb *discardingReadSeekerPlusByte) Seek(offset int64, whence int) (int64, error) { + switch whence { + case io.SeekStart: + n := offset - drsb.offset + if n < 0 { + return 0, errors.New("unsupported rewind via whence: io.SeekStart") + } + _, err := io.CopyN(io.Discard, drsb, n) + return drsb.offset, err + case io.SeekCurrent: + _, err := io.CopyN(io.Discard, drsb, offset) + return drsb.offset, err + default: + return 0, errors.New("unsupported whence: io.SeekEnd") + } +} + +func (ras *readerAtSeeker) Read(p []byte) (n int, err error) { + ras.mu.Lock() + defer ras.mu.Unlock() + n, err = ras.ra.ReadAt(p, ras.position) + ras.position += int64(n) + return n, err +} + +func (ras *readerAtSeeker) Seek(offset int64, whence int) (int64, error) { + ras.mu.Lock() + defer 
ras.mu.Unlock()
+	switch whence {
+	case io.SeekStart:
+		ras.position = offset
+	case io.SeekCurrent:
+		ras.position += offset
+	case io.SeekEnd:
+		return 0, errors.New("unsupported whence: io.SeekEnd")
+	default:
+		return 0, errors.New("unsupported whence")
+	}
+	return ras.position, nil
+}
+
+func (rsa *readSeekerAt) ReadAt(p []byte, off int64) (n int, err error) {
+	rsa.mu.Lock()
+	defer rsa.mu.Unlock()
+	if _, err := rsa.rs.Seek(off, io.SeekStart); err != nil {
+		return 0, err
+	}
+	return rsa.rs.Read(p)
+}
diff --git a/ipld/car/v2/internal/io/offset_read_seeker.go b/ipld/car/v2/internal/io/offset_read_seeker.go
new file mode 100644
index 0000000000..b3899ab784
--- /dev/null
+++ b/ipld/car/v2/internal/io/offset_read_seeker.go
@@ -0,0 +1,122 @@
+package io
+
+import (
+	"errors"
+	"io"
+)
+
+var (
+	_ io.ReaderAt   = (*offsetReadSeeker)(nil)
+	_ io.ReadSeeker = (*offsetReadSeeker)(nil)
+)
+
+// offsetReadSeeker implements Read and ReadAt on a section
+// of an underlying io.ReaderAt.
+// The main difference between io.SectionReader and offsetReadSeeker is that
+// NewOffsetReadSeeker does not require the user to know the number of readable bytes.
+//
+// It also partially implements Seek, where the implementation panics if io.SeekEnd is passed.
+// This is because offsetReadSeeker does not know the end of the file, and therefore cannot
+// seek relative to it.
+type offsetReadSeeker struct {
+	r    io.ReaderAt
+	base int64
+	off  int64
+	b    [1]byte // avoid alloc in ReadByte
+}
+
+type ReadSeekerAt interface {
+	io.Reader
+	io.ReaderAt
+	io.Seeker
+	io.ByteReader
+}
+
+// NewOffsetReadSeeker returns a ReadSeekerAt that reads from r
+// starting at offset off and stops with io.EOF when r reaches its end.
+// The Seek function will panic if whence io.SeekEnd is passed.
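+//
+// A hedged usage sketch (f is any io.ReaderAt, e.g. an *os.File; the offset
+// is illustrative):
+//
+//	rs, err := NewOffsetReadSeeker(f, 51)
+//	if err != nil {
+//		return err
+//	}
+//	b, err := rs.ReadByte()                // reads the byte at absolute offset 51
+//	pos, err := rs.Seek(0, io.SeekCurrent) // pos == 1, relative to the base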
+func NewOffsetReadSeeker(r io.ReaderAt, off int64) (ReadSeekerAt, error) { + if or, ok := r.(*offsetReadSeeker); ok { + oldBase := or.base + newBase := or.base + off + if newBase < oldBase { + return nil, errors.New("NewOffsetReadSeeker overflow int64") + } + return &offsetReadSeeker{ + r: or.r, + base: newBase, + off: newBase, + }, nil + } + return &offsetReadSeeker{ + r: r, + base: off, + off: off, + }, nil +} + +func (o *offsetReadSeeker) Read(p []byte) (n int, err error) { + n, err = o.r.ReadAt(p, o.off) + oldOffset := o.off + off := oldOffset + int64(n) + if off < oldOffset { + return 0, errors.New("ReadAt offset overflow") + } + o.off = off + return +} + +func (o *offsetReadSeeker) ReadAt(p []byte, off int64) (n int, err error) { + if off < 0 { + return 0, io.EOF + } + oldOffset := off + off += o.base + if off < oldOffset { + return 0, errors.New("ReadAt offset overflow") + } + return o.r.ReadAt(p, off) +} + +func (o *offsetReadSeeker) ReadByte() (byte, error) { + _, err := o.Read(o.b[:]) + return o.b[0], err +} + +func (o *offsetReadSeeker) Offset() int64 { + return o.off +} + +func (o *offsetReadSeeker) Seek(offset int64, whence int) (int64, error) { + switch whence { + case io.SeekStart: + oldOffset := offset + off := offset + o.base + if off < oldOffset { + return 0, errors.New("Seek offset overflow") + } + o.off = off + case io.SeekCurrent: + oldOffset := o.off + if offset < 0 { + if -offset > oldOffset { + return 0, errors.New("Seek offset underflow") + } + o.off = oldOffset + offset + } else { + off := oldOffset + offset + if off < oldOffset { + return 0, errors.New("Seek offset overflow") + } + o.off = off + } + case io.SeekEnd: + panic("unsupported whence: SeekEnd") + } + return o.Position(), nil +} + +// Position returns the current position of this reader relative to the initial offset. +func (o *offsetReadSeeker) Position() int64 { + return o.off - o.base +} diff --git a/ipld/car/v2/internal/io/offset_write_seeker.go b/ipld/car/v2/internal/io/offset_write_seeker.go new file mode 100644 index 0000000000..7e0f6ba58e --- /dev/null +++ b/ipld/car/v2/internal/io/offset_write_seeker.go @@ -0,0 +1,41 @@ +package io + +import "io" + +var ( + _ io.Writer = (*OffsetWriteSeeker)(nil) + _ io.WriteSeeker = (*OffsetWriteSeeker)(nil) +) + +type OffsetWriteSeeker struct { + w io.WriterAt + base int64 + offset int64 +} + +func NewOffsetWriter(w io.WriterAt, off int64) *OffsetWriteSeeker { + return &OffsetWriteSeeker{w, off, off} +} + +func (ow *OffsetWriteSeeker) Write(b []byte) (n int, err error) { + n, err = ow.w.WriteAt(b, ow.offset) + ow.offset += int64(n) + return +} + +func (ow *OffsetWriteSeeker) Seek(offset int64, whence int) (int64, error) { + switch whence { + case io.SeekStart: + ow.offset = offset + ow.base + case io.SeekCurrent: + ow.offset += offset + case io.SeekEnd: + panic("unsupported whence: SeekEnd") + } + return ow.Position(), nil +} + +// Position returns the current position of this writer relative to the initial offset, i.e. the number of bytes written. 
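+//
+// A hedged usage sketch (f is any io.WriterAt, e.g. an *os.File):
+//
+//	w := NewOffsetWriter(f, 11) // start writing at absolute offset 11
+//	n, err := w.Write([]byte("hello"))
+//	// on success, n == 5 and w.Position() == 5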
+func (ow *OffsetWriteSeeker) Position() int64 {
+	return ow.offset - ow.base
+}
diff --git a/ipld/car/v2/internal/loader/counting_loader.go b/ipld/car/v2/internal/loader/counting_loader.go
new file mode 100644
index 0000000000..e428993ea4
--- /dev/null
+++ b/ipld/car/v2/internal/loader/counting_loader.go
@@ -0,0 +1,61 @@
+package loader
+
+import (
+	"bytes"
+	"io"
+
+	"github.com/ipld/go-ipld-prime"
+	"github.com/ipld/go-ipld-prime/linking"
+	"github.com/multiformats/go-varint"
+)
+
+// counter tracks how much data has been read.
+type counter struct {
+	totalRead uint64
+}
+
+func (c *counter) Size() uint64 {
+	return c.totalRead
+}
+
+// ReadCounter provides an externally consumable interface to the
+// additional data tracked about the linksystem.
+type ReadCounter interface {
+	Size() uint64
+}
+
+type countingReader struct {
+	r io.Reader
+	c *counter
+}
+
+func (c *countingReader) Read(p []byte) (int, error) {
+	n, err := c.r.Read(p)
+	c.c.totalRead += uint64(n)
+	return n, err
+}
+
+// CountingLinkSystem wraps an ipld.LinkSystem to track the size of data
+// loaded through it in a `counter` object. Each time nodes that trigger
+// block reads are loaded from the link system, the size of the block as it
+// would appear in a CAR file is added to the counter (including the size of
+// the CID and the varint length for the block data).
+func CountingLinkSystem(ls ipld.LinkSystem) (ipld.LinkSystem, ReadCounter) {
+	c := counter{}
+	clc := ls
+	clc.StorageReadOpener = func(lc linking.LinkContext, l ipld.Link) (io.Reader, error) {
+		r, err := ls.StorageReadOpener(lc, l)
+		if err != nil {
+			return nil, err
+		}
+		buf := bytes.NewBuffer(nil)
+		n, err := buf.ReadFrom(r)
+		if err != nil {
+			return nil, err
+		}
+		size := varint.ToUvarint(uint64(n) + uint64(len(l.Binary())))
+		c.totalRead += uint64(len(size)) + uint64(len(l.Binary()))
+		return &countingReader{buf, &c}, nil
+	}
+	return clc, &c
+}
diff --git a/ipld/car/v2/internal/loader/writing_loader.go b/ipld/car/v2/internal/loader/writing_loader.go
new file mode 100644
index 0000000000..0bf2ae3c8a
--- /dev/null
+++ b/ipld/car/v2/internal/loader/writing_loader.go
@@ -0,0 +1,126 @@
+package loader
+
+import (
+	"bytes"
+	"io"
+
+	"github.com/ipfs/go-cid"
+	"github.com/ipld/go-car/v2/index"
+	"github.com/ipld/go-ipld-prime"
+	"github.com/ipld/go-ipld-prime/linking"
+	"github.com/multiformats/go-multicodec"
+	"github.com/multiformats/go-varint"
+)
+
+type writerOutput struct {
+	w     io.Writer
+	size  uint64
+	code  multicodec.Code
+	rcrds map[cid.Cid]index.Record
+}
+
+func (w *writerOutput) Size() uint64 {
+	return w.size
+}
+
+func (w *writerOutput) Index() (index.Index, error) {
+	idx, err := index.New(w.code)
+	if err != nil {
+		return nil, err
+	}
+	rcrds := make([]index.Record, 0, len(w.rcrds))
+	for _, r := range w.rcrds {
+		rcrds = append(rcrds, r)
+	}
+	if err := idx.Load(rcrds); err != nil {
+		return nil, err
+	}
+
+	return idx, nil
+}
+
+// An IndexTracker tracks the records loaded/written and can calculate an
+// index based on them.
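+//
+// A hedged usage sketch, in terms of TeeingLinkSystem below (ls and carOut
+// are an assumed ipld.LinkSystem and io.Writer respectively):
+//
+//	tls, tracker := TeeingLinkSystem(ls, carOut, 0, multicodec.CarMultihashIndexSorted)
+//	// ... load blocks through tls ...
+//	idx, err := tracker.Index() // index over everything written so far
+//	size := tracker.Size()      // bytes of CAR payload written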
+type IndexTracker interface {
+	ReadCounter
+	Index() (index.Index, error)
+}
+
+type writingReader struct {
+	r   io.Reader
+	len int64
+	cid string
+	wo  *writerOutput
+}
+
+func (w *writingReader) Read(p []byte) (int, error) {
+	if w.wo != nil {
+		// write the cid
+		size := varint.ToUvarint(uint64(w.len) + uint64(len(w.cid)))
+		if _, err := w.wo.w.Write(size); err != nil {
+			return 0, err
+		}
+		if _, err := w.wo.w.Write([]byte(w.cid)); err != nil {
+			return 0, err
+		}
+		cpy := bytes.NewBuffer(w.r.(*bytes.Buffer).Bytes())
+		if _, err := cpy.WriteTo(w.wo.w); err != nil {
+			return 0, err
+		}
+		_, c, err := cid.CidFromBytes([]byte(w.cid))
+		if err != nil {
+			return 0, err
+		}
+		w.wo.rcrds[c] = index.Record{
+			Cid:    c,
+			Offset: w.wo.size,
+		}
+		w.wo.size += uint64(w.len) + uint64(len(size)+len(w.cid))
+
+		w.wo = nil
+	}
+
+	return w.r.Read(p)
+}
+
+// TeeingLinkSystem wraps an ipld.LinkSystem so that each time a block is loaded from it,
+// that block is also written as a CAR block to the provided io.Writer. Metadata
+// (the size of data written) is provided in the second return value.
+//
+// The `initialOffset` is used to calculate the offsets recorded for the index, and will be
+// included in the `.Size()` of the IndexTracker.
+//
+// An indexCodec of `index.CarIndexNoIndex` can be used to not track these offsets.
+func TeeingLinkSystem(ls ipld.LinkSystem, w io.Writer, initialOffset uint64, indexCodec multicodec.Code) (ipld.LinkSystem, IndexTracker) {
+	wo := writerOutput{
+		w:     w,
+		size:  initialOffset,
+		code:  indexCodec,
+		rcrds: make(map[cid.Cid]index.Record),
+	}
+
+	tls := ls
+	tls.StorageReadOpener = func(lc linking.LinkContext, l ipld.Link) (io.Reader, error) {
+		_, c, err := cid.CidFromBytes([]byte(l.Binary()))
+		if err != nil {
+			return nil, err
+		}
+
+		// if we've already read this cid in this session, don't re-write it.
+		if _, ok := wo.rcrds[c]; ok {
+			return ls.StorageReadOpener(lc, l)
+		}
+
+		r, err := ls.StorageReadOpener(lc, l)
+		if err != nil {
+			return nil, err
+		}
+		buf := bytes.NewBuffer(nil)
+		n, err := buf.ReadFrom(r)
+		if err != nil {
+			return nil, err
+		}
+		return &writingReader{buf, n, l.Binary(), &wo}, nil
+	}
+	return tls, &wo
+}
diff --git a/ipld/car/v2/internal/store/identity.go b/ipld/car/v2/internal/store/identity.go
new file mode 100644
index 0000000000..d61a57c48d
--- /dev/null
+++ b/ipld/car/v2/internal/store/identity.go
@@ -0,0 +1,17 @@
+package store
+
+import (
+	"github.com/ipfs/go-cid"
+	"github.com/multiformats/go-multihash"
+)
+
+// IsIdentity inspects the CID and determines whether it is an IDENTITY CID.
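+//
+// For example (a sketch), a caller can serve an identity block without any
+// underlying read, because the digest is the block data itself:
+//
+//	if digest, ok, err := IsIdentity(key); err == nil && ok {
+//		return digest, nil
+//	}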
+func IsIdentity(key cid.Cid) (digest []byte, ok bool, err error) {
+	dmh, err := multihash.Decode(key.Hash())
+	if err != nil {
+		return nil, false, err
+	}
+	ok = dmh.Code == multihash.IDENTITY
+	digest = dmh.Digest
+	return digest, ok, nil
+}
diff --git a/ipld/car/v2/internal/store/index.go b/ipld/car/v2/internal/store/index.go
new file mode 100644
index 0000000000..62c9d2a02a
--- /dev/null
+++ b/ipld/car/v2/internal/store/index.go
@@ -0,0 +1,108 @@
+package store
+
+import (
+	"bytes"
+	"io"
+
+	"github.com/ipfs/go-cid"
+	carv2 "github.com/ipld/go-car/v2"
+	"github.com/ipld/go-car/v2/index"
+	"github.com/ipld/go-car/v2/internal/carv1/util"
+	internalio "github.com/ipld/go-car/v2/internal/io"
+	"github.com/multiformats/go-multicodec"
+	"github.com/multiformats/go-varint"
+)
+
+// FindCid can be used to look up the existence, size and offset of a block
+// if it exists in the CAR as specified by the index, and optionally to read
+// the data bytes of the block.
+func FindCid(
+	reader io.ReaderAt,
+	idx index.Index,
+	key cid.Cid,
+	useWholeCids bool,
+	zeroLenAsEOF bool,
+	maxReadBytes uint64,
+	readBytes bool,
+) ([]byte, int64, int, error) {
+
+	var fnData []byte
+	var fnOffset int64
+	var fnLen int = -1
+	var fnErr error
+	err := idx.GetAll(key, func(offset uint64) bool {
+		reader, err := internalio.NewOffsetReadSeeker(reader, int64(offset))
+		if err != nil {
+			fnErr = err
+			return false
+		}
+		var readCid cid.Cid
+		if readBytes {
+			readCid, fnData, err = util.ReadNode(reader, zeroLenAsEOF, maxReadBytes)
+			if err != nil {
+				fnErr = err
+				return false
+			}
+			fnLen = len(fnData)
+		} else {
+			sectionLen, err := varint.ReadUvarint(reader)
+			if err != nil {
+				fnErr = err
+				return false
+			}
+			var cidLen int
+			cidLen, readCid, err = cid.CidFromReader(reader)
+			if err != nil {
+				fnErr = err
+				return false
+			}
+			fnLen = int(sectionLen) - cidLen
+			fnOffset = int64(offset) + reader.(interface{ Position() int64 }).Position()
+		}
+		if useWholeCids {
+			if !readCid.Equals(key) {
+				fnLen = -1
+				return true // continue looking
+			}
+			return false
+		} else {
+			if !bytes.Equal(readCid.Hash(), key.Hash()) {
+				// weird, bad index, continue looking
+				fnLen = -1
+				return true
+			}
+			return false
+		}
+	})
+	if err != nil {
+		return nil, -1, -1, err
+	}
+	if fnErr != nil {
+		return nil, -1, -1, fnErr
+	}
+	if fnLen == -1 {
+		return nil, -1, -1, index.ErrNotFound
+	}
+	return fnData, fnOffset, fnLen, nil
+}
+
+// Finalize will write the index to the writer at the offset specified in the header. It should only
+// be used for a CARv2 and when the CAR interface is being closed.
+func Finalize(writer io.WriterAt, header carv2.Header, idx *InsertionIndex, dataSize uint64, storeIdentityCIDs bool, indexCodec multicodec.Code) error {
+	// TODO check if add index option is set and don't write the index then set index offset to zero.
+	header = header.WithDataSize(dataSize)
+	header.Characteristics.SetFullyIndexed(storeIdentityCIDs)
+
+	// TODO if index not needed don't bother flattening it.
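+	//
+	// Layout of the finalized CARv2 once the two writes below succeed
+	// (sizes per the CARv2 spec):
+	//
+	//	| 11-byte pragma | 40-byte header | CARv1 data payload | index |
+	//
+	// The index is written at header.IndexOffset, then the header is written
+	// back at carv2.PragmaSize to record the final DataSize.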
+	fi, err := idx.Flatten(indexCodec)
+	if err != nil {
+		return err
+	}
+	if _, err := index.WriteTo(fi, internalio.NewOffsetWriter(writer, int64(header.IndexOffset))); err != nil {
+		return err
+	}
+	if _, err := header.WriteTo(internalio.NewOffsetWriter(writer, carv2.PragmaSize)); err != nil {
+		return err
+	}
+	return nil
+}
diff --git a/ipld/car/v2/internal/store/indexcheck.go b/ipld/car/v2/internal/store/indexcheck.go
new file mode 100644
index 0000000000..04b673e05c
--- /dev/null
+++ b/ipld/car/v2/internal/store/indexcheck.go
@@ -0,0 +1,84 @@
+package store
+
+import (
+	"github.com/ipfs/go-cid"
+	carv2 "github.com/ipld/go-car/v2"
+)
+
+// ShouldPut returns true if the block should be put into the CAR according to the options provided
+// and the index. It returns false if the block should not be put into the CAR, either because it
+// is an identity block and StoreIdentityCIDs is false, or because it already exists and
+// BlockstoreAllowDuplicatePuts is false.
+func ShouldPut(
+	idx *InsertionIndex,
+	c cid.Cid,
+	maxIndexCidSize uint64,
+	storeIdentityCIDs bool,
+	blockstoreAllowDuplicatePuts bool,
+	blockstoreUseWholeCIDs bool,
+) (bool, error) {
+
+	// If the StoreIdentityCIDs option is disabled then treat IDENTITY CIDs like IdStore.
+	if !storeIdentityCIDs {
+		// Check for IDENTITY CID. If IDENTITY, ignore and move to the next block.
+		if _, ok, err := IsIdentity(c); err != nil {
+			return false, err
+		} else if ok {
+			return false, nil
+		}
+	}
+
+	// Check if the CID is too big; if larger than the maximum allowed size, return an error.
+	// Note, we need to check this regardless of whether we have an IDENTITY CID or not,
+	// since multihash codes other than IDENTITY can also result in large digests.
+	cSize := uint64(len(c.Bytes()))
+	if cSize > maxIndexCidSize {
+		return false, &carv2.ErrCidTooLarge{MaxSize: maxIndexCidSize, CurrentSize: cSize}
+	}
+
+	if !blockstoreAllowDuplicatePuts {
+		if blockstoreUseWholeCIDs {
+			has, err := idx.HasExactCID(c)
+			if err != nil {
+				return false, err
+			}
+			return !has, nil // deduplicated by CID
+		}
+		_, err := idx.Get(c)
+		if err == nil {
+			return false, nil // deduplicated by hash
+		}
+	}
+
+	return true, nil
+}
+
+// Has returns true if the block exists in the store according to the various
+// rules associated with the options. Similar to ShouldPut, but for the simpler
+// Has() case.
+func Has(
+	idx *InsertionIndex,
+	c cid.Cid,
+	maxIndexCidSize uint64,
+	storeIdentityCIDs bool,
+	blockstoreAllowDuplicatePuts bool,
+	blockstoreUseWholeCIDs bool,
+) (bool, error) {
+
+	// If the StoreIdentityCIDs option is disabled then treat IDENTITY CIDs like IdStore.
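+	// In that case identity CIDs are reported as present without consulting
+	// the index at all, since their data is embedded in the CID itself.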
+ if !storeIdentityCIDs { + if _, ok, err := IsIdentity(c); err != nil { + return false, err + } else if ok { + return true, nil + } + } + + if blockstoreUseWholeCIDs { + return idx.HasExactCID(c) + } + return idx.HasMultihash(c.Hash()) +} diff --git a/ipld/car/v2/internal/store/insertionindex.go b/ipld/car/v2/internal/store/insertionindex.go new file mode 100644 index 0000000000..bc25fffbcf --- /dev/null +++ b/ipld/car/v2/internal/store/insertionindex.go @@ -0,0 +1,264 @@ +package store + +import ( + "bytes" + "encoding/binary" + "errors" + "fmt" + "io" + + "github.com/ipfs/go-cid" + "github.com/ipld/go-car/v2/index" + "github.com/multiformats/go-multicodec" + "github.com/multiformats/go-multihash" + "github.com/petar/GoLLRB/llrb" + cbor "github.com/whyrusleeping/cbor/go" +) + +// This index is intended to be efficient for random-access, in-memory lookups +// and is not intended to be an index type that is attached to a CARv2. +// See flatten() for conversion of this data to a known, existing index type. + +var ( + errUnsupported = errors.New("not supported") + insertionIndexCodec = multicodec.Code(0x300003) +) + +type InsertionIndex struct { + items llrb.LLRB +} + +func NewInsertionIndex() *InsertionIndex { + return &InsertionIndex{} +} + +type recordDigest struct { + digest []byte + index.Record +} + +func (r recordDigest) Less(than llrb.Item) bool { + other, ok := than.(recordDigest) + if !ok { + return false + } + return bytes.Compare(r.digest, other.digest) < 0 +} + +func newRecordDigest(r index.Record) recordDigest { + d, err := multihash.Decode(r.Hash()) + if err != nil { + panic(err) + } + + return recordDigest{d.Digest, r} +} + +func newRecordFromCid(c cid.Cid, at uint64) recordDigest { + d, err := multihash.Decode(c.Hash()) + if err != nil { + panic(err) + } + + return recordDigest{d.Digest, index.Record{Cid: c, Offset: at}} +} + +func (ii *InsertionIndex) InsertNoReplace(key cid.Cid, n uint64) { + ii.items.InsertNoReplace(newRecordFromCid(key, n)) +} + +func (ii *InsertionIndex) Get(c cid.Cid) (uint64, error) { + record, err := ii.getRecord(c) + if err != nil { + return 0, err + } + return record.Offset, nil +} + +func (ii *InsertionIndex) getRecord(c cid.Cid) (index.Record, error) { + d, err := multihash.Decode(c.Hash()) + if err != nil { + return index.Record{}, err + } + entry := recordDigest{digest: d.Digest} + e := ii.items.Get(entry) + if e == nil { + return index.Record{}, index.ErrNotFound + } + r, ok := e.(recordDigest) + if !ok { + return index.Record{}, errUnsupported + } + + return r.Record, nil +} + +func (ii *InsertionIndex) GetAll(c cid.Cid, fn func(uint64) bool) error { + d, err := multihash.Decode(c.Hash()) + if err != nil { + return err + } + entry := recordDigest{digest: d.Digest} + + any := false + iter := func(i llrb.Item) bool { + existing := i.(recordDigest) + if !bytes.Equal(existing.digest, entry.digest) { + // We've already looked at all entries with matching digests. 
+			return false
+		}
+		any = true
+		return fn(existing.Record.Offset)
+	}
+	ii.items.AscendGreaterOrEqual(entry, iter)
+	if !any {
+		return index.ErrNotFound
+	}
+	return nil
+}
+
+func (ii *InsertionIndex) Marshal(w io.Writer) (uint64, error) {
+	l := uint64(0)
+	if err := binary.Write(w, binary.LittleEndian, int64(ii.items.Len())); err != nil {
+		return l, err
+	}
+	l += 8
+
+	var err error
+	iter := func(i llrb.Item) bool {
+		if err = cbor.Encode(w, i.(recordDigest).Record); err != nil {
+			return false
+		}
+		return true
+	}
+	ii.items.AscendGreaterOrEqual(ii.items.Min(), iter)
+	return l, err
+}
+
+func (ii *InsertionIndex) Unmarshal(r io.Reader) error {
+	var length int64
+	if err := binary.Read(r, binary.LittleEndian, &length); err != nil {
+		return err
+	}
+	d := cbor.NewDecoder(r)
+	for i := int64(0); i < length; i++ {
+		var rec index.Record
+		if err := d.Decode(&rec); err != nil {
+			return err
+		}
+		ii.items.InsertNoReplace(newRecordDigest(rec))
+	}
+	return nil
+}
+
+func (ii *InsertionIndex) ForEach(f func(multihash.Multihash, uint64) error) error {
+	var err error
+	ii.items.AscendGreaterOrEqual(ii.items.Min(), func(i llrb.Item) bool {
+		r := i.(recordDigest).Record
+		err = f(r.Cid.Hash(), r.Offset)
+		return err == nil
+	})
+	return err
+}
+
+func (ii *InsertionIndex) ForEachCid(f func(cid.Cid, uint64) error) error {
+	var err error
+	ii.items.AscendGreaterOrEqual(ii.items.Min(), func(i llrb.Item) bool {
+		r := i.(recordDigest).Record
+		err = f(r.Cid, r.Offset)
+		return err == nil
+	})
+	return err
+}
+
+func (ii *InsertionIndex) Codec() multicodec.Code {
+	return insertionIndexCodec
+}
+
+func (ii *InsertionIndex) Load(rs []index.Record) error {
+	for _, r := range rs {
+		rec := newRecordDigest(r)
+		if rec.digest == nil {
+			return fmt.Errorf("invalid entry: %v", r)
+		}
+		ii.items.InsertNoReplace(rec)
+	}
+	return nil
+}
+
+// Flatten returns a formatted index in the given codec for more efficient subsequent loading.
+func (ii *InsertionIndex) Flatten(codec multicodec.Code) (index.Index, error) {
+	si, err := index.New(codec)
+	if err != nil {
+		return nil, err
+	}
+	rcrds := make([]index.Record, ii.items.Len())
+
+	idx := 0
+	iter := func(i llrb.Item) bool {
+		rcrds[idx] = i.(recordDigest).Record
+		idx++
+		return true
+	}
+	ii.items.AscendGreaterOrEqual(ii.items.Min(), iter)
+
+	if err := si.Load(rcrds); err != nil {
+		return nil, err
+	}
+	return si, nil
+}
+
+// Note that HasExactCID is very similar to GetAll,
+// but it's separate as it allows us to compare Record.Cid directly,
+// whereas GetAll just provides Record.Offset.
+
+func (ii *InsertionIndex) HasExactCID(c cid.Cid) (bool, error) {
+	d, err := multihash.Decode(c.Hash())
+	if err != nil {
+		return false, err
+	}
+	entry := recordDigest{digest: d.Digest}
+
+	found := false
+	iter := func(i llrb.Item) bool {
+		existing := i.(recordDigest)
+		if !bytes.Equal(existing.digest, entry.digest) {
+			// We've already looked at all entries with matching digests.
+			return false
+		}
+		if existing.Record.Cid == c {
+			// We found an exact match.
+			found = true
+			return false
+		}
+		// Continue looking in ascending order.
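+		// As a hedged illustration: a raw CID and a dag-cbor CID derived from
+		// the same multihash share a digest, so both sit in this run, but at
+		// most one of them equals c exactly.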
+		return true
+	}
+	ii.items.AscendGreaterOrEqual(entry, iter)
+	return found, nil
+}
+
+func (ii *InsertionIndex) HasMultihash(mh multihash.Multihash) (bool, error) {
+	d, err := multihash.Decode(mh)
+	if err != nil {
+		return false, err
+	}
+	entry := recordDigest{digest: d.Digest}
+
+	found := false
+	iter := func(i llrb.Item) bool {
+		existing := i.(recordDigest)
+		if !bytes.Equal(existing.digest, entry.digest) {
+			// We've already looked at all entries with matching digests.
+			return false
+		}
+		if bytes.Equal(existing.Record.Cid.Hash(), mh) {
+			found = true
+			return false
+		}
+		// Continue looking in ascending order.
+		return true
+	}
+	ii.items.AscendGreaterOrEqual(entry, iter)
+	return found, nil
+}
diff --git a/ipld/car/v2/internal/store/resume.go b/ipld/car/v2/internal/store/resume.go
new file mode 100644
index 0000000000..ff47223733
--- /dev/null
+++ b/ipld/car/v2/internal/store/resume.go
@@ -0,0 +1,198 @@
+package store
+
+import (
+	"errors"
+	"fmt"
+	"io"
+
+	"github.com/ipfs/go-cid"
+	carv2 "github.com/ipld/go-car/v2"
+	"github.com/ipld/go-car/v2/internal/carv1"
+	internalio "github.com/ipld/go-car/v2/internal/io"
+	"github.com/multiformats/go-varint"
+)
+
+type ReaderWriterAt interface {
+	io.ReaderAt
+	io.WriterAt
+}
+
+// ResumableVersion performs two tasks: check that there is a valid header at the start of the
+// reader, then check whether the version of that header matches what we expect.
+func ResumableVersion(reader io.Reader, writeAsV1 bool) error {
+	version, err := carv2.ReadVersion(reader)
+	if err != nil {
+		// The file is not a valid CAR file and we cannot resume from it.
+		// Or the write must have failed before the pragma was written.
+		return err
+	}
+
+	switch {
+	case version == 1 && writeAsV1:
+	case version == 2 && !writeAsV1:
+	default:
+		// The file is not the expected version and we cannot resume from it.
+		return fmt.Errorf("cannot resume on CAR file with version %v", version)
+	}
+	return nil
+}
+
+// Resume will attempt to resume a CARv2 or CARv1 file by checking that there exists an existing
+// CAR and that the CAR header details match what is being requested for resumption.
+// Resumption of a CARv2 involves "unfinalizing" the header by resetting it back to a bare state
+// and then truncating the file to remove the index. Truncation is important because it allows a
+// non-finalized CARv2 to be resumed from: the header won't contain the DataSize of the payload
+// body, and if the file also contains an index, we cannot determine the end of the payload.
+// Therefore, if a resumed, previously finalized CARv2 were closed again without finalization
+// (e.g. due to a crash) while still carrying its index, the file would no longer be parseable:
+// we wouldn't have DataSize, and we wouldn't be able to determine it by parsing the payload to
+// EOF.
+func Resume(
+	rw ReaderWriterAt,
+	dataReader io.ReaderAt,
+	dataWriter *internalio.OffsetWriteSeeker,
+	idx *InsertionIndex,
+	roots []cid.Cid,
+	dataOffset uint64,
+	v1 bool,
+	maxAllowedHeaderSize uint64,
+	zeroLengthSectionAsEOF bool,
+) error {
+
+	var headerInFile carv2.Header
+	var v1r internalio.ReadSeekerAt
+
+	if !v1 {
+		if _, ok := rw.(interface{ Truncate(size int64) error }); !ok {
+			return fmt.Errorf("cannot resume a CARv2 without the ability to truncate (e.g. an *os.File)")
+		}
+
+		// Check if the file was finalized by trying to read the CARv2 header.
+		// We check because, if the file is finalized, the CARv1 reader behaviour needs to be
+		// adjusted: EOF will not signify the end of the CARv1 payload, i.e. an index is most
+		// likely present.
+		r, err := internalio.NewOffsetReadSeeker(rw, carv2.PragmaSize)
+		if err != nil {
+			return err
+		}
+		_, err = headerInFile.ReadFrom(r)
+
+		// If reading the CARv2 header succeeded, and the CARv1 offset in the header is not zero,
+		// then the file is most likely finalized. Check the padding and truncate the file to
+		// remove the index. Otherwise, carry on reading the v1 payload at the expected data
+		// offset.
+		if err == nil && headerInFile.DataOffset != 0 {
+			if headerInFile.DataOffset != dataOffset {
+				// Assert that the padding on file matches the given WithDataPadding option.
+				wantPadding := headerInFile.DataOffset - carv2.PragmaSize - carv2.HeaderSize
+				gotPadding := dataOffset - carv2.PragmaSize - carv2.HeaderSize
+				return fmt.Errorf(
+					"cannot resume from file with mismatched CARv1 offset; "+
+						"`WithDataPadding` option must match the padding on file. "+
+						"Expected padding value of %v but got %v", wantPadding, gotPadding,
+				)
+			} else if headerInFile.DataSize == 0 {
+				// If the CARv1 size is zero while the CARv1 offset wasn't, then the CARv2 header
+				// was most likely partially written. Since we write the header last in Finalize,
+				// the file most likely contains the index and we cannot know where it starts;
+				// therefore we can't resume.
+				return errors.New("corrupt CARv2 header; cannot resume from file")
+			}
+		}
+
+		v1r, err = internalio.NewOffsetReadSeeker(dataReader, 0)
+		if err != nil {
+			return err
+		}
+	} else {
+		var err error
+		v1r, err = internalio.NewOffsetReadSeeker(rw, 0)
+		if err != nil {
+			return err
+		}
+	}
+
+	header, err := carv1.ReadHeader(v1r, maxAllowedHeaderSize)
+	if err != nil {
+		// Cannot read the CARv1 header; the file is most likely corrupt.
+		return fmt.Errorf("error reading car header: %w", err)
+	}
+	if !header.Matches(carv1.CarHeader{Roots: roots, Version: 1}) {
+		// Cannot resume if the version and roots do not match.
+		return errors.New("cannot resume on file with mismatching data header")
+	}
+
+	if headerInFile.DataOffset != 0 {
+		// If the header in the file contains the size of the CARv1 payload, then the index is
+		// most likely present. Since we will need to re-generate the index, as the one in the
+		// file is flattened, truncate the file so that the Readonly.backing has the right set of
+		// bytes to deal with. This effectively means resuming from a finalized file will wipe
+		// its index, even if no blocks are put, unless the user calls Finalize.
+		if err := rw.(interface{ Truncate(size int64) error }).Truncate(int64(headerInFile.DataOffset + headerInFile.DataSize)); err != nil {
+			return err
+		}
+	}
+
+	if !v1 {
+		// Now that the CARv2 header is present on file, clear it to avoid an incorrect size and
+		// offset in the header in case the blockstore is closed without finalization and is
+		// resumed from.
+		wat, ok := rw.(io.WriterAt)
+		if !ok { // how would we get this far??
+			return errors.New("cannot resume from file without io.WriterAt")
+		}
+		if _, err := new(carv2.Header).WriteTo(internalio.NewOffsetWriter(wat, carv2.PragmaSize)); err != nil {
+			return fmt.Errorf("could not un-finalize: %w", err)
+		}
+	}
+
+	// TODO See how we can reduce duplicate code here.
+	// The code here comes from car.GenerateIndex.
+	// Copied because we need to populate an insertindex, not a sorted index.
+	// Producing a sorted index via generate, then converting it to insertindex is not possible.
+	// Because the Index interface does not expose internal records.
+	// This may be done as part of https://github.com/ipld/go-car/issues/95
+
+	offset, err := carv1.HeaderSize(header)
+	if err != nil {
+		return err
+	}
+	sectionOffset := int64(0)
+	if sectionOffset, err = v1r.Seek(int64(offset), io.SeekStart); err != nil {
+		return err
+	}
+
+	for {
+		// Grab the length of the section.
+		// Note that ReadUvarint wants a ByteReader.
+		length, err := varint.ReadUvarint(v1r)
+		if err != nil {
+			if err == io.EOF {
+				break
+			}
+			return err
+		}
+
+		// Null padding; by default it's an error.
+		if length == 0 {
+			if zeroLengthSectionAsEOF {
+				break
+			} else {
+				return fmt.Errorf("carv1 null padding not allowed by default; see WithZeroLengthSectionAsEOF")
+			}
+		}
+
+		// Grab the CID.
+		n, c, err := cid.CidFromReader(v1r)
+		if err != nil {
+			return err
+		}
+		idx.InsertNoReplace(c, uint64(sectionOffset))
+
+		// Seek to the next section by skipping the block.
+		// The section length includes the CID, so subtract it.
+		if sectionOffset, err = v1r.Seek(int64(length)-int64(n), io.SeekCurrent); err != nil {
+			return err
+		}
+	}
+	// Seek to the end of the last skipped block, where the writer should resume writing.
+	_, err = dataWriter.Seek(sectionOffset, io.SeekStart)
+	return err
+}
diff --git a/ipld/car/v2/options.go b/ipld/car/v2/options.go
new file mode 100644
index 0000000000..e28589e890
--- /dev/null
+++ b/ipld/car/v2/options.go
@@ -0,0 +1,240 @@
+package car
+
+import (
+	"math"
+
+	"github.com/ipld/go-car/v2/index"
+	"github.com/ipld/go-ipld-prime/traversal"
+	"github.com/multiformats/go-multicodec"
+
+	"github.com/ipld/go-car/v2/internal/carv1"
+)
+
+// DefaultMaxIndexCidSize specifies the maximum size in bytes accepted as a section CID by CARv2 index.
+const DefaultMaxIndexCidSize = 2 << 10 // 2 KiB
+
+// DefaultMaxAllowedHeaderSize specifies the default maximum size that a CARv1
+// decode (including within a CARv2 container) will allow a header to be without
+// erroring. This is to prevent OOM errors where a header prefix includes a
+// too-large size specifier.
+// Currently set to 32 MiB.
+const DefaultMaxAllowedHeaderSize = carv1.DefaultMaxAllowedHeaderSize
+
+// DefaultMaxAllowedSectionSize specifies the default maximum size that a CARv1
+// decode (including within a CARv2 container) will allow a section to be
+// without erroring. This is to prevent OOM errors where a section prefix
+// includes a too-large size specifier.
+// Typically IPLD blocks should be under 2 MiB (ideally under 1 MiB), so unless
+// atypical data is expected, this should not be a large value.
+// Currently set to 8 MiB.
+const DefaultMaxAllowedSectionSize = carv1.DefaultMaxAllowedSectionSize
+
+// Option describes an option which affects behavior when interacting with CAR files.
+type Option func(*Options)
+
+// ReadOption hints that an API wants options related only to reading CAR files.
+type ReadOption = Option
+
+// WriteOption hints that an API wants options related only to writing CAR files.
+type WriteOption = Option
+
+// ReadWriteOption is either a ReadOption or a WriteOption.
+// Deprecated: use Option instead.
+type ReadWriteOption = Option
+
+// Options holds the configured options after applying a number of
+// Option funcs.
+//
+// This type should not be used directly by end users; it's only exposed as a
+// side effect of Option.
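+//
+// As a hedged usage sketch, resolving defaults from a set of Option funcs
+// looks like:
+//
+//	opts := car.ApplyOptions(car.UseIndexCodec(multicodec.CarIndexSorted))
+//	_ = opts.MaxIndexCidSize // left zero by the option, so it falls back to DefaultMaxIndexCidSize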
+type Options struct {
+	DataPadding            uint64
+	IndexPadding           uint64
+	IndexCodec             multicodec.Code
+	ZeroLengthSectionAsEOF bool
+	MaxIndexCidSize        uint64
+	StoreIdentityCIDs      bool
+
+	BlockstoreAllowDuplicatePuts bool
+	BlockstoreUseWholeCIDs       bool
+	MaxTraversalLinks            uint64
+	WriteAsCarV1                 bool
+	TraversalPrototypeChooser    traversal.LinkTargetNodePrototypeChooser
+	TrustedCAR                   bool
+
+	MaxAllowedHeaderSize  uint64
+	MaxAllowedSectionSize uint64
+}
+
+// ApplyOptions applies given opts and returns the resulting Options.
+// This function should not be used directly by end users; it's only exposed as a
+// side effect of Option.
+func ApplyOptions(opt ...Option) Options {
+	opts := Options{
+		MaxTraversalLinks:     math.MaxInt64, // default: traverse all
+		MaxAllowedHeaderSize:  carv1.DefaultMaxAllowedHeaderSize,
+		MaxAllowedSectionSize: carv1.DefaultMaxAllowedSectionSize,
+	}
+	for _, o := range opt {
+		o(&opts)
+	}
+	// Set defaults for zero valued fields.
+	if opts.IndexCodec == 0 {
+		opts.IndexCodec = multicodec.CarMultihashIndexSorted
+	}
+	if opts.MaxIndexCidSize == 0 {
+		opts.MaxIndexCidSize = DefaultMaxIndexCidSize
+	}
+	return opts
+}
+
+// ZeroLengthSectionAsEOF sets whether to allow the CARv1 decoder to treat
+// a zero-length section as the end of the input CAR file. For example, this can
+// be useful to allow "null padding" after a CARv1 without knowing where the
+// padding begins.
+func ZeroLengthSectionAsEOF(enable bool) Option {
+	return func(o *Options) {
+		o.ZeroLengthSectionAsEOF = enable
+	}
+}
+
+// UseDataPadding sets the padding to be added between the CARv2 header and its data payload on Finalize.
+func UseDataPadding(p uint64) Option {
+	return func(o *Options) {
+		o.DataPadding = p
+	}
+}
+
+// UseIndexPadding sets the padding between the data payload and its index on Finalize.
+func UseIndexPadding(p uint64) Option {
+	return func(o *Options) {
+		o.IndexPadding = p
+	}
+}
+
+// UseIndexCodec sets the codec used for index generation.
+func UseIndexCodec(c multicodec.Code) Option {
+	return func(o *Options) {
+		o.IndexCodec = c
+	}
+}
+
+// WithoutIndex flags that no index should be included in generation.
+func WithoutIndex() Option {
+	return func(o *Options) {
+		o.IndexCodec = index.CarIndexNone
+	}
+}
+
+// StoreIdentityCIDs sets whether to persist sections that are referenced by
+// CIDs with multihash.IDENTITY digest.
+// When writing CAR files with this option, Characteristics.IsFullyIndexed will
+// be set.
+//
+// By default, the blockstore interface will always return true for Has() called
+// with identity CIDs, but when this option is turned on, it will defer to the
+// index.
+//
+// When creating an index (or loading a CARv1 as a blockstore), when this option
+// is on, identity CIDs will be included in the index.
+//
+// This option is disabled by default.
+func StoreIdentityCIDs(b bool) Option {
+	return func(o *Options) {
+		o.StoreIdentityCIDs = b
+	}
+}
+
+// MaxIndexCidSize specifies the maximum allowed size for indexed CIDs in bytes.
+// Indexing a CID larger than the allowed size results in an ErrCidTooLarge error.
+func MaxIndexCidSize(s uint64) Option {
+	return func(o *Options) {
+		o.MaxIndexCidSize = s
+	}
+}
+
+// WithTraversalPrototypeChooser specifies the prototype chooser that should be used
+// when performing traversals in writes from a linksystem.
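+//
+// A minimal sketch of such a chooser (mirroring the default this package uses
+// internally when traversing):
+//
+//	chooser := func(_ ipld.Link, _ linking.LinkContext) (ipld.NodePrototype, error) {
+//		return basicnode.Prototype.Any, nil
+//	}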
+func WithTraversalPrototypeChooser(t traversal.LinkTargetNodePrototypeChooser) Option {
+	return func(o *Options) {
+		o.TraversalPrototypeChooser = t
+	}
+}
+
+// WithTrustedCAR specifies whether the CAR payload is trusted; when enabled,
+// block data is not verified against its CID as it is read from the CAR file.
+func WithTrustedCAR(t bool) Option {
+	return func(o *Options) {
+		o.TrustedCAR = t
+	}
+}
+
+// MaxAllowedHeaderSize overrides the default maximum size (of 32 MiB) that a
+// CARv1 decode (including within a CARv2 container) will allow a header to be
+// without erroring.
+func MaxAllowedHeaderSize(max uint64) Option {
+	return func(o *Options) {
+		o.MaxAllowedHeaderSize = max
+	}
+}
+
+// MaxAllowedSectionSize overrides the default maximum size (of 8 MiB) that a
+// CARv1 decode (including within a CARv2 container) will allow a section to be
+// without erroring.
+// Typically IPLD blocks should be under 2 MiB (ideally under 1 MiB), so unless
+// atypical data is expected, this should not be a large value.
+func MaxAllowedSectionSize(max uint64) Option {
+	return func(o *Options) {
+		o.MaxAllowedSectionSize = max
+	}
+}
+
+// --------------------------------------------------- storage interface options
+
+// UseWholeCIDs is a read option which makes a CAR storage interface (blockstore
+// or storage) identify blocks by whole CIDs, and not just their multihashes.
+// The default is to use multihashes, which matches the current semantics of
+// go-ipfs-blockstore v1.
+//
+// Enabling this option affects a number of methods, including read-only ones:
+//
+// • Get, Has, and HasSize will only return a block if the entire CID is
+// present in the CAR file.
+//
+// • AllKeysChan will return the original whole CIDs, instead of with their
+// multicodec set to "raw" to just provide multihashes.
+//
+// • If AllowDuplicatePuts isn't set, Put and PutMany will deduplicate by the
+// whole CID, allowing different CIDs with equal multihashes.
+//
+// Note that this option only affects the storage interfaces (blockstore
+// or storage), and is ignored by the root go-car/v2 package.
+func UseWholeCIDs(enable bool) Option {
+	return func(o *Options) {
+		o.BlockstoreUseWholeCIDs = enable
+	}
+}
+
+// WriteAsCarV1 is a write option which makes a CAR interface (blockstore or
+// storage) write the output as a CARv1 only, with no CARv2 header or index.
+// Indexing is used internally during write but is discarded upon finalization.
+//
+// Note that this option only affects the storage interfaces (blockstore
+// or storage), and is ignored by the root go-car/v2 package.
+func WriteAsCarV1(asCarV1 bool) Option {
+	return func(o *Options) {
+		o.WriteAsCarV1 = asCarV1
+	}
+}
+
+// AllowDuplicatePuts is a write option which makes a CAR interface (blockstore
+// or storage) not deduplicate blocks in Put and PutMany. The default is to
+// deduplicate, which matches the current semantics of go-ipfs-blockstore v1.
+//
+// Note that this option only affects the storage interfaces (blockstore
+// or storage), and is ignored by the root go-car/v2 package.
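+//
+// A hedged sketch of combining these storage options via the blockstore
+// package, which re-exports them (the path and roots here are illustrative):
+//
+//	bs, err := blockstore.OpenReadWrite("example.car", roots,
+//		blockstore.UseWholeCIDs(true),
+//		blockstore.AllowDuplicatePuts(false),
+//	)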
+func AllowDuplicatePuts(allow bool) Option {
+	return func(o *Options) {
+		o.BlockstoreAllowDuplicatePuts = allow
+	}
+}
diff --git a/ipld/car/v2/options_test.go b/ipld/car/v2/options_test.go
new file mode 100644
index 0000000000..6daa473fef
--- /dev/null
+++ b/ipld/car/v2/options_test.go
@@ -0,0 +1,50 @@
+package car_test
+
+import (
+	"math"
+	"testing"
+
+	carv2 "github.com/ipld/go-car/v2"
+	"github.com/ipld/go-car/v2/blockstore"
+	"github.com/multiformats/go-multicodec"
+	"github.com/stretchr/testify/require"
+)
+
+func TestApplyOptions_SetsExpectedDefaults(t *testing.T) {
+	require.Equal(t, carv2.Options{
+		IndexCodec:            multicodec.CarMultihashIndexSorted,
+		MaxIndexCidSize:       carv2.DefaultMaxIndexCidSize,
+		MaxTraversalLinks:     math.MaxInt64,
+		MaxAllowedHeaderSize:  32 << 20,
+		MaxAllowedSectionSize: 8 << 20,
+	}, carv2.ApplyOptions())
+}
+
+func TestApplyOptions_AppliesOptions(t *testing.T) {
+	require.Equal(t,
+		carv2.Options{
+			DataPadding:                  123,
+			IndexPadding:                 456,
+			IndexCodec:                   multicodec.CarIndexSorted,
+			ZeroLengthSectionAsEOF:       true,
+			MaxIndexCidSize:              789,
+			StoreIdentityCIDs:            true,
+			BlockstoreAllowDuplicatePuts: true,
+			BlockstoreUseWholeCIDs:       true,
+			MaxTraversalLinks:            math.MaxInt64,
+			MaxAllowedHeaderSize:         101,
+			MaxAllowedSectionSize:        202,
+		},
+		carv2.ApplyOptions(
+			carv2.UseDataPadding(123),
+			carv2.UseIndexPadding(456),
+			carv2.UseIndexCodec(multicodec.CarIndexSorted),
+			carv2.ZeroLengthSectionAsEOF(true),
+			carv2.MaxIndexCidSize(789),
+			carv2.StoreIdentityCIDs(true),
+			carv2.MaxAllowedHeaderSize(101),
+			carv2.MaxAllowedSectionSize(202),
+			blockstore.AllowDuplicatePuts(true),
+			blockstore.UseWholeCIDs(true),
+		))
+}
diff --git a/ipld/car/v2/reader.go b/ipld/car/v2/reader.go
new file mode 100644
index 0000000000..ca6cad1e95
--- /dev/null
+++ b/ipld/car/v2/reader.go
@@ -0,0 +1,368 @@
+package car
+
+import (
+	"errors"
+	"fmt"
+	"io"
+	"math"
+
+	"github.com/ipfs/go-cid"
+	"github.com/ipld/go-car/v2/index"
+	"github.com/ipld/go-car/v2/internal/carv1"
+	"github.com/ipld/go-car/v2/internal/carv1/util"
+	internalio "github.com/ipld/go-car/v2/internal/io"
+	"github.com/multiformats/go-multicodec"
+	"github.com/multiformats/go-multihash"
+	"github.com/multiformats/go-varint"
+	"golang.org/x/exp/mmap"
+)
+
+// Reader represents a reader of CARv2.
+type Reader struct {
+	Header  Header
+	Version uint64
+	r       io.ReaderAt
+	roots   []cid.Cid
+	opts    Options
+	closer  io.Closer
+}
+
+// OpenReader is a wrapper for NewReader which opens the file at path.
+func OpenReader(path string, opts ...Option) (*Reader, error) {
+	f, err := mmap.Open(path)
+	if err != nil {
+		return nil, err
+	}
+
+	r, err := NewReader(f, opts...)
+	if err != nil {
+		return nil, err
+	}
+
+	r.closer = f
+	return r, nil
+}
+
+// NewReader constructs a new reader that reads either CARv1 or CARv2 from the given r.
+// Upon instantiation, the reader inspects the payload and provides appropriate read operations
+// for both CARv1 and CARv2.
+//
+// Note that any version other than 1 or 2 will result in an error. The caller may use
+// Reader.Version to get the actual version r represents. In the case where r represents a CARv1,
+// Reader.Header will not be populated and is left zero-valued.
+func NewReader(r io.ReaderAt, opts ...Option) (*Reader, error) {
+	cr := &Reader{
+		r: r,
+	}
+	cr.opts = ApplyOptions(opts...)
+
+	or, err := internalio.NewOffsetReadSeeker(r, 0)
+	if err != nil {
+		return nil, err
+	}
+	cr.Version, err = ReadVersion(or, opts...)
+	if err != nil {
+		return nil, err
+	}
+
+	if cr.Version != 1 && cr.Version != 2 {
+		return nil, fmt.Errorf("invalid car version: %d", cr.Version)
+	}
+
+	if cr.Version == 2 {
+		if err := cr.readV2Header(); err != nil {
+			return nil, err
+		}
+	}
+
+	return cr, nil
+}
+
+// Roots returns the root CIDs.
+// The root CIDs are extracted lazily from the data payload header.
+func (r *Reader) Roots() ([]cid.Cid, error) {
+	if r.roots != nil {
+		return r.roots, nil
+	}
+	dr, err := r.DataReader()
+	if err != nil {
+		return nil, err
+	}
+	header, err := carv1.ReadHeader(dr, r.opts.MaxAllowedHeaderSize)
+	if err != nil {
+		return nil, err
+	}
+	r.roots = header.Roots
+	return r.roots, nil
+}
+
+func (r *Reader) readV2Header() (err error) {
+	headerSection := io.NewSectionReader(r.r, PragmaSize, HeaderSize)
+	_, err = r.Header.ReadFrom(headerSection)
+	return
+}
+
+// SectionReader implements both io.ReadSeeker and io.ReaderAt.
+// It is the interface version of io.SectionReader, but note that the
+// implementation is not guaranteed to be an io.SectionReader.
+type SectionReader interface {
+	io.Reader
+	io.Seeker
+	io.ReaderAt
+}
+
+// DataReader provides a reader containing the data payload in CARv1 format.
+func (r *Reader) DataReader() (SectionReader, error) {
+	if r.Version == 2 {
+		return io.NewSectionReader(r.r, int64(r.Header.DataOffset), int64(r.Header.DataSize)), nil
+	}
+	return internalio.NewOffsetReadSeeker(r.r, 0)
+}
+
+// IndexReader provides an io.Reader containing the index for the data payload if the index is
+// present. Otherwise, returns nil.
+// Note, this function will always return nil if the backing payload represents a CARv1.
+func (r *Reader) IndexReader() (io.Reader, error) {
+	if r.Version == 1 || !r.Header.HasIndex() {
+		return nil, nil
+	}
+	return internalio.NewOffsetReadSeeker(r.r, int64(r.Header.IndexOffset))
+}
+
+// Stats is returned by an Inspect() call
+type Stats struct {
+	Version        uint64
+	Header         Header
+	Roots          []cid.Cid
+	RootsPresent   bool
+	BlockCount     uint64
+	CodecCounts    map[multicodec.Code]uint64
+	MhTypeCounts   map[multicodec.Code]uint64
+	AvgCidLength   uint64
+	MaxCidLength   uint64
+	MinCidLength   uint64
+	AvgBlockLength uint64
+	MaxBlockLength uint64
+	MinBlockLength uint64
+	IndexCodec     multicodec.Code
+}
+
+// Inspect does a quick scan of a CAR, performing basic validation of the format
+// and returning a Stats object that provides a high-level description of the
+// contents of the CAR.
+// Inspect works for CARv1 and CARv2 contents. A CARv1 will return an
+// uninitialized Header value.
+//
+// If validateBlockHash is true, all block data in the payload will be hashed
+// and compared to the CID for that block, and an error will be returned if there
+// is a mismatch. If false, block data will be skipped over and not checked.
+// Performing a full block hash validation is similar to using a BlockReader and
+// calling Next over all blocks.
+//
+// Inspect will perform a basic check of a CARv2 index, where present, but this
+// does not guarantee that the index is correct. Attempting to read index data
+// from untrusted sources is not recommended. If required, further validation of
+// an index can be performed by loading the index and performing a ForEach() and
+// sanity checking that the offsets are within the data payload section of the
+// CAR. However, re-generation of index data in this case is the recommended
+// course of action.
+//
+// Beyond the checks performed by Inspect, a valid / good CAR is somewhat
+// use-case dependent. Factors to consider include:
+//
+// - Bad indexes, including incorrect offsets, duplicate entries, or other
+// faulty data. Indexes should be re-generated, regardless, if you need to use
+// them and have any reason to not trust the source.
+//
+// - Blocks use codecs that your system doesn't have access to, which may mean
+// you can't traverse a DAG or use the contained data. Stats.CodecCounts
+// contains a list of codecs found in the CAR so this can be checked.
+//
+// - CIDs use multihashes that your system doesn't have access to, which will
+// mean you can't validate block hashes are correct (using validateBlockHash
+// in this case will result in a failure). Stats.MhTypeCounts contains a
+// list of multihashes found in the CAR so this can be checked.
+//
+// - The presence of IDENTITY CIDs, which may not be supported (or desired) by
+// the consumer of the CAR. Stats.CodecCounts can determine the presence
+// of IDENTITY CIDs.
+//
+// - Roots: the number of roots, duplicates, and whether they are related to the
+// blocks contained within the CAR. Stats contains a list of Roots and a
+// RootsPresent bool so further checks can be performed.
+//
+// - DAG completeness is not checked. Any properties relating to the DAG, or
+// DAGs contained within a CAR are the responsibility of the user to check.
+func (r *Reader) Inspect(validateBlockHash bool) (Stats, error) {
+	stats := Stats{
+		Version:      r.Version,
+		Header:       r.Header,
+		CodecCounts:  make(map[multicodec.Code]uint64),
+		MhTypeCounts: make(map[multicodec.Code]uint64),
+	}
+
+	var totalCidLength uint64
+	var totalBlockLength uint64
+	var minCidLength uint64 = math.MaxUint64
+	var minBlockLength uint64 = math.MaxUint64
+
+	dr, err := r.DataReader()
+	if err != nil {
+		return Stats{}, err
+	}
+	bdr := internalio.ToByteReader(dr)
+
+	// Read the roots, not using Roots(), because we need the offset set up in the data reader.
+	header, err := carv1.ReadHeader(dr, r.opts.MaxAllowedHeaderSize)
+	if err != nil {
+		return Stats{}, err
+	}
+	stats.Roots = header.Roots
+	var rootsPresentCount int
+	rootsPresent := make([]bool, len(stats.Roots))
+
+	// read block sections
+	for {
+		sectionLength, err := varint.ReadUvarint(bdr)
+		if err != nil {
+			if err == io.EOF {
+				// if the length of bytes read is non-zero when the error is EOF then signal an unclean EOF.
+				if sectionLength > 0 {
+					return Stats{}, io.ErrUnexpectedEOF
+				}
+				// otherwise, this is a normal ending
+				break
+			}
+			return Stats{}, err
+		}
+		if sectionLength == 0 && r.opts.ZeroLengthSectionAsEOF {
+			// normal ending for this read mode
+			break
+		}
+		if sectionLength > r.opts.MaxAllowedSectionSize {
+			return Stats{}, util.ErrSectionTooLarge
+		}
+
+		// decode just the CID bytes
+		cidLen, c, err := cid.CidFromReader(dr)
+		if err != nil {
+			return Stats{}, err
+		}
+
+		if sectionLength < uint64(cidLen) {
+			// this case is handled differently in the normal ReadNode() path since it
+			// slurps in the whole section bytes and decodes the CID from there - so an
+			// error should come from a failing io.ReadFull
+			return Stats{}, errors.New("section length shorter than CID length")
+		}
+
+		// is this a root block? (also account for duplicate root CIDs)
+		if rootsPresentCount < len(stats.Roots) {
+			for i, r := range stats.Roots {
+				if !rootsPresent[i] && c == r {
+					rootsPresent[i] = true
+					rootsPresentCount++
+				}
+			}
+		}
+
+		cp := c.Prefix()
+		codec := multicodec.Code(cp.Codec)
+		count := stats.CodecCounts[codec]
+		stats.CodecCounts[codec] = count + 1
+		mhtype := multicodec.Code(cp.MhType)
+		count = stats.MhTypeCounts[mhtype]
+		stats.MhTypeCounts[mhtype] = count + 1
+
+		blockLength := sectionLength - uint64(cidLen)
+
+		if validateBlockHash {
+			// Use multihash.SumStream to avoid having to copy the entire block content into memory.
+			// The SumStream uses a buffered copy to write bytes into the hasher which will take
+			// advantage of streaming hash calculation depending on the hash function.
+			// TODO: introduce SumStream in go-cid to simplify the code here.
+			blockReader := io.LimitReader(dr, int64(blockLength))
+			mhl := cp.MhLength
+			if mhtype == multicodec.Identity {
+				mhl = -1
+			}
+			mh, err := multihash.SumStream(blockReader, cp.MhType, mhl)
+			if err != nil {
+				return Stats{}, err
+			}
+			var gotCid cid.Cid
+			switch cp.Version {
+			case 0:
+				gotCid = cid.NewCidV0(mh)
+			case 1:
+				gotCid = cid.NewCidV1(cp.Codec, mh)
+			default:
+				return Stats{}, fmt.Errorf("invalid cid version: %d", cp.Version)
+			}
+			if !gotCid.Equals(c) {
+				return Stats{}, fmt.Errorf("mismatch in content integrity, expected: %s, got: %s", c, gotCid)
+			}
+		} else {
+			// otherwise, skip over it
+			if _, err := dr.Seek(int64(blockLength), io.SeekCurrent); err != nil {
+				return Stats{}, err
+			}
+		}
+
+		stats.BlockCount++
+		totalCidLength += uint64(cidLen)
+		totalBlockLength += blockLength
+		if uint64(cidLen) < minCidLength {
+			minCidLength = uint64(cidLen)
+		}
+		if uint64(cidLen) > stats.MaxCidLength {
+			stats.MaxCidLength = uint64(cidLen)
+		}
+		if blockLength < minBlockLength {
+			minBlockLength = blockLength
+		}
+		if blockLength > stats.MaxBlockLength {
+			stats.MaxBlockLength = blockLength
+		}
+	}
+
+	stats.RootsPresent = len(stats.Roots) == rootsPresentCount
+	if stats.BlockCount > 0 {
+		stats.MinCidLength = minCidLength
+		stats.MinBlockLength = minBlockLength
+		stats.AvgCidLength = totalCidLength / stats.BlockCount
+		stats.AvgBlockLength = totalBlockLength / stats.BlockCount
+	}
+
+	if stats.Version != 1 && stats.Header.HasIndex() {
+		idxr, err := r.IndexReader()
+		if err != nil {
+			return Stats{}, err
+		}
+		stats.IndexCodec, err = index.ReadCodec(idxr)
+		if err != nil {
+			return Stats{}, err
+		}
+	}
+
+	return stats, nil
+}
+
+// Close closes the underlying reader if it was opened by OpenReader.
+func (r *Reader) Close() error {
+	if r.closer != nil {
+		return r.closer.Close()
+	}
+	return nil
+}
+
+// ReadVersion reads the version from the pragma.
+// This function accepts both CARv1 and CARv2 payloads.
+func ReadVersion(r io.Reader, opts ...Option) (uint64, error) {
+	o := ApplyOptions(opts...)
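+	// Note: the 11-byte CARv2 pragma is itself a valid CARv1-shaped header
+	// encoding {version: 2}, so a single CARv1 header decode yields the
+	// version for either format.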
+ header, err := carv1.ReadHeader(r, o.MaxAllowedHeaderSize) + if err != nil { + return 0, err + } + return header.Version, nil +} diff --git a/ipld/car/v2/reader_test.go b/ipld/car/v2/reader_test.go new file mode 100644 index 0000000000..b398a870ac --- /dev/null +++ b/ipld/car/v2/reader_test.go @@ -0,0 +1,592 @@ +package car_test + +import ( + "bytes" + "encoding/hex" + "io" + "os" + "strings" + "testing" + + "github.com/ipfs/go-cid" + carv2 "github.com/ipld/go-car/v2" + "github.com/ipld/go-car/v2/index" + "github.com/ipld/go-car/v2/internal/carv1" + "github.com/multiformats/go-multicodec" + "github.com/stretchr/testify/require" +) + +func TestReadVersion(t *testing.T) { + tests := []struct { + name string + path string + want uint64 + wantErr bool + }{ + { + name: "CarV1VersionIsOne", + path: "testdata/sample-v1.car", + want: 1, + }, + { + name: "CarV2VersionIsTwo", + path: "testdata/sample-rw-bs-v2.car", + want: 2, + }, + { + name: "CarV1VersionWithZeroLenSectionIsOne", + path: "testdata/sample-v1-with-zero-len-section.car", + want: 1, + }, + { + name: "AnotherCarV1VersionWithZeroLenSectionIsOne", + path: "testdata/sample-v1-with-zero-len-section2.car", + want: 1, + }, + { + name: "WrappedCarV1InCarV2VersionIsTwo", + path: "testdata/sample-wrapped-v2.car", + want: 2, + }, + { + name: "FutureVersionWithCorrectPragmaIsAsExpected", + path: "testdata/sample-rootless-v42.car", + want: 42, + }, + { + name: "CarV1WithValidHeaderButCorruptSectionIsOne", + path: "testdata/sample-v1-tailing-corrupt-section.car", + want: 1, + }, + { + name: "CarV2WithValidHeaderButCorruptSectionAndIndexIsTwo", + path: "testdata/sample-v2-corrupt-data-and-index.car", + want: 2, + }, + { + name: "CarFileWithCorruptPragmaIsError", + path: "testdata/sample-corrupt-pragma.car", + wantErr: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + f, err := os.Open(tt.path) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, f.Close()) }) + + got, err := carv2.ReadVersion(f) + if tt.wantErr { + require.Error(t, err, "ReadVersion() error = %v, wantErr %v", err, tt.wantErr) + } else { + require.NoError(t, err) + require.Equal(t, tt.want, got, "ReadVersion() got = %v, want %v", got, tt.want) + } + }) + } +} + +func TestReaderFailsOnUnknownVersion(t *testing.T) { + _, err := carv2.OpenReader("testdata/sample-rootless-v42.car") + require.EqualError(t, err, "invalid car version: 42") +} + +func TestReaderFailsOnCorruptPragma(t *testing.T) { + _, err := carv2.OpenReader("testdata/sample-corrupt-pragma.car") + require.EqualError(t, err, "unexpected EOF") +} + +func TestReader_WithCarV1Consistency(t *testing.T) { + tests := []struct { + name string + path string + zerLenAsEOF bool + }{ + { + name: "CarV1WithoutZeroLengthSection", + path: "testdata/sample-v1.car", + }, + { + name: "CarV1WithZeroLenSection", + path: "testdata/sample-v1-with-zero-len-section.car", + zerLenAsEOF: true, + }, + { + name: "AnotherCarV1WithZeroLenSection", + path: "testdata/sample-v1-with-zero-len-section2.car", + zerLenAsEOF: true, + }, + { + name: "CarV1WithZeroLenSectionWithoutOption", + path: "testdata/sample-v1-with-zero-len-section.car", + }, + { + name: "AnotherCarV1WithZeroLenSectionWithoutOption", + path: "testdata/sample-v1-with-zero-len-section2.car", + }, + { + name: "CorruptCarV1", + path: "testdata/sample-v1-tailing-corrupt-section.car", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + subject, err := carv2.OpenReader(tt.path, carv2.ZeroLengthSectionAsEOF(tt.zerLenAsEOF)) 
+ require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, subject.Close()) }) + wantReader := requireNewCarV1ReaderFromV1File(t, tt.path, tt.zerLenAsEOF) + + require.Equal(t, uint64(1), subject.Version) + gotRoots, err := subject.Roots() + require.NoError(t, err) + require.Equal(t, wantReader.Header.Roots, gotRoots) + ir, err := subject.IndexReader() + require.Nil(t, ir) + require.NoError(t, err) + }) + } +} + +func TestReader_WithCarV2Consistency(t *testing.T) { + tests := []struct { + name string + path string + zerLenAsEOF bool + }{ + { + name: "CarV2WrappingV1", + path: "testdata/sample-wrapped-v2.car", + }, + { + name: "CarV2ProducedByBlockstore", + path: "testdata/sample-rw-bs-v2.car", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + subject, err := carv2.OpenReader(tt.path, carv2.ZeroLengthSectionAsEOF(tt.zerLenAsEOF)) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, subject.Close()) }) + wantReader := requireNewCarV1ReaderFromV2File(t, tt.path, tt.zerLenAsEOF) + + require.Equal(t, uint64(2), subject.Version) + gotRoots, err := subject.Roots() + require.NoError(t, err) + require.Equal(t, wantReader.Header.Roots, gotRoots) + + gotIndexReader, err := subject.IndexReader() + require.NoError(t, err) + require.NotNil(t, gotIndexReader) + gotIndex, err := index.ReadFrom(gotIndexReader) + require.NoError(t, err) + dr, err := subject.DataReader() + require.NoError(t, err) + wantIndex, err := carv2.GenerateIndex(dr) + require.NoError(t, err) + require.Equal(t, wantIndex, gotIndex) + }) + } +} + +func TestOpenReader_DoesNotPanicForReadersCreatedBeforeClosure(t *testing.T) { + subject, err := carv2.OpenReader("testdata/sample-wrapped-v2.car") + require.NoError(t, err) + dReaderBeforeClosure, err := subject.DataReader() + require.NoError(t, err) + iReaderBeforeClosure, err := subject.IndexReader() + require.NoError(t, err) + require.NoError(t, subject.Close()) + + buf := make([]byte, 1) + panicTest := func(r io.Reader) { + _, err := r.Read(buf) + require.EqualError(t, err, "mmap: closed") + } + + require.NotPanics(t, func() { panicTest(dReaderBeforeClosure) }) + require.NotPanics(t, func() { panicTest(iReaderBeforeClosure) }) +} + +func TestOpenReader_DoesNotPanicForReadersCreatedAfterClosure(t *testing.T) { + subject, err := carv2.OpenReader("testdata/sample-wrapped-v2.car") + require.NoError(t, err) + require.NoError(t, subject.Close()) + dReaderAfterClosure, err := subject.DataReader() + require.NoError(t, err) + iReaderAfterClosure, err := subject.IndexReader() + require.NoError(t, err) + + buf := make([]byte, 1) + panicTest := func(r io.Reader) { + _, err := r.Read(buf) + require.EqualError(t, err, "mmap: closed") + } + + require.NotPanics(t, func() { panicTest(dReaderAfterClosure) }) + require.NotPanics(t, func() { panicTest(iReaderAfterClosure) }) +} + +func TestReader_ReturnsNilWhenThereIsNoIndex(t *testing.T) { + tests := []struct { + name string + path string + }{ + { + name: "IndexlessCarV2", + path: "testdata/sample-v2-indexless.car", + }, + { + name: "CarV1", + path: "testdata/sample-v1.car", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + subject, err := carv2.OpenReader(tt.path) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, subject.Close()) }) + ir, err := subject.IndexReader() + require.NoError(t, err) + require.Nil(t, ir) + }) + } +} + +func requireNewCarV1ReaderFromV2File(t *testing.T, carV12Path string, zerLenAsEOF bool) *carv1.CarReader { + f, err := os.Open(carV12Path) + 
require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, f.Close()) }) + + _, err = f.Seek(carv2.PragmaSize, io.SeekStart) + require.NoError(t, err) + header := carv2.Header{} + _, err = header.ReadFrom(f) + require.NoError(t, err) + return requireNewCarV1Reader(t, io.NewSectionReader(f, int64(header.DataOffset), int64(header.DataSize)), zerLenAsEOF) +} + +func requireNewCarV1ReaderFromV1File(t *testing.T, carV1Path string, zerLenAsEOF bool) *carv1.CarReader { + f, err := os.Open(carV1Path) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, f.Close()) }) + return requireNewCarV1Reader(t, f, zerLenAsEOF) +} + +func requireNewCarV1Reader(t *testing.T, r io.Reader, zerLenAsEOF bool) *carv1.CarReader { + var cr *carv1.CarReader + var err error + if zerLenAsEOF { + cr, err = carv1.NewCarReaderWithZeroLengthSectionAsEOF(r) + } else { + cr, err = carv1.NewCarReader(r) + } + require.NoError(t, err) + return cr +} + +func TestInspect(t *testing.T) { + tests := []struct { + name string + path string + carHex string + zerLenAsEOF bool + expectedStats carv2.Stats + }{ + { + name: "IndexlessCarV2", + path: "testdata/sample-v2-indexless.car", + expectedStats: carv2.Stats{ + Version: 2, + Header: carv2.Header{ + Characteristics: carv2.Characteristics{0, 0}, + DataOffset: 51, + DataSize: 479907, + IndexOffset: 0, + }, + Roots: []cid.Cid{mustCidDecode("bafy2bzaced4ueelaegfs5fqu4tzsh6ywbbpfk3cxppupmxfdhbpbhzawfw5oy")}, + RootsPresent: true, + AvgBlockLength: 417, // 417.6644423260248 + MinBlockLength: 1, + MaxBlockLength: 1342, + AvgCidLength: 37, // 37.86939942802669 + MinCidLength: 14, + MaxCidLength: 38, + BlockCount: 1049, + CodecCounts: map[multicodec.Code]uint64{ + multicodec.Raw: 6, + multicodec.DagCbor: 1043, + }, + MhTypeCounts: map[multicodec.Code]uint64{ + multicodec.Identity: 6, + multicodec.Blake2b256: 1043, + }, + }, + }, + { + // same payload as IndexlessCarV2, so only difference is the Version & Header + name: "CarV1", + path: "testdata/sample-v1.car", + expectedStats: carv2.Stats{ + Version: 1, + Header: carv2.Header{}, + Roots: []cid.Cid{mustCidDecode("bafy2bzaced4ueelaegfs5fqu4tzsh6ywbbpfk3cxppupmxfdhbpbhzawfw5oy")}, + RootsPresent: true, + AvgBlockLength: 417, // 417.6644423260248 + MinBlockLength: 1, + MaxBlockLength: 1342, + AvgCidLength: 37, // 37.86939942802669 + MinCidLength: 14, + MaxCidLength: 38, + BlockCount: 1049, + CodecCounts: map[multicodec.Code]uint64{ + multicodec.Raw: 6, + multicodec.DagCbor: 1043, + }, + MhTypeCounts: map[multicodec.Code]uint64{ + multicodec.Identity: 6, + multicodec.Blake2b256: 1043, + }, + }, + }, + { + // same payload as IndexlessCarV2, so only difference is the Header + name: "CarV2ProducedByBlockstore", + path: "testdata/sample-rw-bs-v2.car", + expectedStats: carv2.Stats{ + Version: 2, + Header: carv2.Header{ + DataOffset: 1464, + DataSize: 273, + IndexOffset: 1737, + }, + Roots: []cid.Cid{ + mustCidDecode("bafkreifuosuzujyf4i6psbneqtwg2fhplc2wxptc5euspa2gn3bwhnihfu"), + mustCidDecode("bafkreifc4hca3inognou377hfhvu2xfchn2ltzi7yu27jkaeujqqqdbjju"), + mustCidDecode("bafkreig5lvr4l6b4fr3un4xvzeyt3scevgsqjgrhlnwxw2unwbn5ro276u"), + }, + RootsPresent: true, + BlockCount: 3, + CodecCounts: map[multicodec.Code]uint64{multicodec.Raw: 3}, + MhTypeCounts: map[multicodec.Code]uint64{multicodec.Sha2_256: 3}, + AvgCidLength: 36, + MaxCidLength: 36, + MinCidLength: 36, + AvgBlockLength: 6, + MaxBlockLength: 9, + MinBlockLength: 4, + IndexCodec: multicodec.CarMultihashIndexSorted, + }, + }, + // same as CarV1 but with a zero-byte EOF to test 
options + { + name: "CarV1VersionWithZeroLenSectionIsOne", + path: "testdata/sample-v1-with-zero-len-section.car", + zerLenAsEOF: true, + expectedStats: carv2.Stats{ + Version: 1, + Header: carv2.Header{}, + Roots: []cid.Cid{mustCidDecode("bafy2bzaced4ueelaegfs5fqu4tzsh6ywbbpfk3cxppupmxfdhbpbhzawfw5oy")}, + RootsPresent: true, + AvgBlockLength: 417, // 417.6644423260248 + MinBlockLength: 1, + MaxBlockLength: 1342, + AvgCidLength: 37, // 37.86939942802669 + MinCidLength: 14, + MaxCidLength: 38, + BlockCount: 1049, + CodecCounts: map[multicodec.Code]uint64{ + multicodec.Raw: 6, + multicodec.DagCbor: 1043, + }, + MhTypeCounts: map[multicodec.Code]uint64{ + multicodec.Identity: 6, + multicodec.Blake2b256: 1043, + }, + }, + }, + { + // A case where this _could_ be a valid CAR if we allowed identity CIDs + // and not matching block contents to exist, there's no block bytes in + // this. It will only fail if you don't validate the CID matches the, + // bytes (see TestInspectError for that case). + name: "IdentityCID", + // 47 {version:1,roots:[identity cid]} 25 identity cid (dag-json {"identity":"block"}) + carHex: "2f a265726f6f747381d82a581a0001a90200147b226964656e74697479223a22626c6f636b227d6776657273696f6e01 19 01a90200147b226964656e74697479223a22626c6f636b227d", + expectedStats: carv2.Stats{ + Version: 1, + Roots: []cid.Cid{mustCidDecode("baguqeaaupmrgszdfnz2gs5dzei5ceytmn5rwwit5")}, + RootsPresent: true, + BlockCount: 1, + CodecCounts: map[multicodec.Code]uint64{multicodec.DagJson: 1}, + MhTypeCounts: map[multicodec.Code]uint64{multicodec.Identity: 1}, + AvgCidLength: 25, + MaxCidLength: 25, + MinCidLength: 25, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + var reader *carv2.Reader + var err error + if tt.path != "" { + reader, err = carv2.OpenReader(tt.path, carv2.ZeroLengthSectionAsEOF(tt.zerLenAsEOF)) + require.NoError(t, err) + } else { + byts, err := hex.DecodeString(strings.ReplaceAll(tt.carHex, " ", "")) + require.NoError(t, err) + reader, err = carv2.NewReader(bytes.NewReader(byts), carv2.ZeroLengthSectionAsEOF(tt.zerLenAsEOF)) + require.NoError(t, err) + } + t.Cleanup(func() { require.NoError(t, reader.Close()) }) + stats, err := reader.Inspect(false) + require.NoError(t, err) + require.Equal(t, tt.expectedStats, stats) + }) + } +} + +func TestInspectError(t *testing.T) { + tests := []struct { + name string + carHex string + expectedOpenError string + expectedInspectError string + validateBlockHash bool + }{ + { + name: "BadCidV0", + carHex: "3aa265726f6f747381d8305825000130302030303030303030303030303030303030303030303030303030303030303030306776657273696f6e010130", + expectedInspectError: "expected 1 as the cid version number, got: 48", + }, + { + name: "BadHeaderLength", + carHex: "e0e0e0e0a7060c6f6c4cca943c236f4b196723489608edb42a8b8fa80b6776657273696f6e19", + expectedOpenError: "invalid header data, length of read beyond allowable maximum", + }, + { + name: "BadSectionLength", + carHex: "11a265726f6f7473806776657273696f6e01e0e0e0e0a7060155122001d448afd928065458cf670b60f5a594d735af0172c8d67f22a81680132681ca00000000000000000000", + expectedInspectError: "invalid section data, length of read beyond allowable maximum", + }, + { + name: "BadSectionLength2", + carHex: "3aa265726f6f747381d8305825000130302030303030303030303030303030303030303030303030303030303030303030306776657273696f6e01200130302030303030303030303030303030303030303030303030303030303030303030303030303030303030", + expectedInspectError: "section length shorter than CID length", + 
validateBlockHash: true, + }, + { + name: "BadSectionLength3", + carHex: "11a265726f6f7473f66776657273696f6e0180", + expectedInspectError: "unexpected EOF", + }, + { + name: "BadBlockHash(SanityCheck)", // this should pass because we don't ask the CID be validated even though it doesn't match + // header cid data + carHex: "11a265726f6f7473806776657273696f6e 012e0155122001d448afd928065458cf670b60f5a594d735af0172c8d67f22a81680132681ca ffffffffffffffffffff", + }, + { + name: "BadBlockHash", // same as above, but we ask for CID validation + // header cid data + carHex: "11a265726f6f7473806776657273696f6e 012e0155122001d448afd928065458cf670b60f5a594d735af0172c8d67f22a81680132681ca ffffffffffffffffffff", + validateBlockHash: true, + expectedInspectError: "mismatch in content integrity, expected: bafkreiab2rek7wjiazkfrt3hbnqpljmu24226alszdlh6ivic2abgjubzi, got: bafkreiaaqoxrddiyuy6gxnks6ioqytxhq5a7tchm2mm5htigznwiljukmm", + }, + { + name: "IdentityCID", // a case where this _could_ be a valid CAR if we allowed identity CIDs and not matching block contents to exist, there's no block bytes in this + // 47 {version:1,roots:[identity cid]} 25 identity cid (dag-json {"identity":"block"}) + carHex: "2f a265726f6f747381d82a581a0001a90200147b226964656e74697479223a22626c6f636b227d6776657273696f6e01 19 01a90200147b226964656e74697479223a22626c6f636b227d", + validateBlockHash: true, + expectedInspectError: "mismatch in content integrity, expected: baguqeaaupmrgszdfnz2gs5dzei5ceytmn5rwwit5, got: baguqeaaa", + }, + // the bad index tests are manually constructed from this single-block CARv2 by adjusting the Uint32 and Uint64 values in the index: + // pragma carv2 header carv1 icodec count codec count (swi) width dataLen mh offset + // 0aa16776657273696f6e02 00000000000000000000000000000000330000000000000041000000000000007400000000000000 11a265726f6f7473806776657273696f6e012e0155122001d448afd928065458cf670b60f5a594d735af0172c8d67f22a81680132681ca00000000000000000000 8108 01000000 1200000000000000 01000000 28000000 2800000000000000 01d448afd928065458cf670b60f5a594d735af0172c8d67f22a81680132681ca 1200000000000000 + // we don't test any further into the index, to do that, a user should do a ForEach across the loaded index (and sanity check the offsets) + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + car, _ := hex.DecodeString(strings.ReplaceAll(tt.carHex, " ", "")) + reader, err := carv2.NewReader(bytes.NewReader(car)) + if tt.expectedOpenError != "" { + require.Error(t, err) + require.Equal(t, tt.expectedOpenError, err.Error()) + return + } else { + require.NoError(t, err) + } + t.Cleanup(func() { require.NoError(t, reader.Close()) }) + _, err = reader.Inspect(tt.validateBlockHash) + if tt.expectedInspectError != "" { + require.Error(t, err) + require.Equal(t, tt.expectedInspectError, err.Error()) + } else { + require.NoError(t, err) + } + }) + } +} + +func TestIndex_ReadFromCorruptIndex(t *testing.T) { + tests := []struct { + name string + givenCarHex string + wantErr string + }{ + { + name: "BadIndexCountOverflow", + // pragma carv2 header carv1 icodec count codec count (swi) width dataLen mh offset + givenCarHex: "0aa16776657273696f6e02 00000000000000000000000000000000330000000000000041000000000000007400000000000000 11a265726f6f7473806776657273696f6e012e0155122001d448afd928065458cf670b60f5a594d735af0172c8d67f22a81680132681ca00000000000000000000 8108 ffffffff 1200000000000000 01000000 28000000 2800000000000000 01d448afd928065458cf670b60f5a594d735af0172c8d67f22a81680132681ca 
1200000000000000", + wantErr: "index too big; MultihashIndexSorted count is overflowing int32", + }, + { + name: "BadIndexCountTooMany", + // pragma carv2 header carv1 icodec count codec count (swi) width dataLen mh offset + givenCarHex: "0aa16776657273696f6e02 00000000000000000000000000000000330000000000000041000000000000007400000000000000 11a265726f6f7473806776657273696f6e012e0155122001d448afd928065458cf670b60f5a594d735af0172c8d67f22a81680132681ca00000000000000000000 8108 ffffff7f 1200000000000000 01000000 28000000 2800000000000000 01d448afd928065458cf670b60f5a594d735af0172c8d67f22a81680132681ca 1200000000000000", + wantErr: "unexpected EOF", + }, + { + name: "BadIndexMultiWidthOverflow", + // pragma carv2 header carv1 icodec count codec count (swi) width dataLen mh offset + givenCarHex: "0aa16776657273696f6e02 00000000000000000000000000000000330000000000000041000000000000007400000000000000 11a265726f6f7473806776657273696f6e012e0155122001d448afd928065458cf670b60f5a594d735af0172c8d67f22a81680132681ca00000000000000000000 8108 01000000 1200000000000000 ffffffff 28000000 2800000000000000 01d448afd928065458cf670b60f5a594d735af0172c8d67f22a81680132681ca 1200000000000000", + wantErr: "index too big; multiWidthIndex count is overflowing int32", + }, + { + name: "BadIndexMultiWidthTooMany", + // pragma carv2 header carv1 icodec count codec count (swi) width dataLen mh offset + givenCarHex: "0aa16776657273696f6e02 00000000000000000000000000000000330000000000000041000000000000007400000000000000 11a265726f6f7473806776657273696f6e012e0155122001d448afd928065458cf670b60f5a594d735af0172c8d67f22a81680132681ca00000000000000000000 8108 01000000 1200000000000000 ffffff7f 28000000 2800000000000000 01d448afd928065458cf670b60f5a594d735af0172c8d67f22a81680132681ca 1200000000000000", + wantErr: "unexpected EOF", + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + car, _ := hex.DecodeString(strings.ReplaceAll(test.givenCarHex, " ", "")) + reader, err := carv2.NewReader(bytes.NewReader(car)) + require.NoError(t, err) + + ir, err := reader.IndexReader() + require.NoError(t, err) + require.NotNil(t, ir) + + gotIdx, err := index.ReadFrom(ir) + if test.wantErr == "" { + require.NoError(t, err) + require.NotNil(t, gotIdx) + } else { + require.Error(t, err) + require.Equal(t, test.wantErr, err.Error()) + require.Nil(t, gotIdx) + } + }) + } +} + +func mustCidDecode(s string) cid.Cid { + c, err := cid.Decode(s) + if err != nil { + panic(err) + } + return c +} diff --git a/ipld/car/v2/selective.go b/ipld/car/v2/selective.go new file mode 100644 index 0000000000..5aaac55940 --- /dev/null +++ b/ipld/car/v2/selective.go @@ -0,0 +1,289 @@ +package car + +import ( + "context" + "fmt" + "io" + "math" + "os" + + "github.com/ipfs/go-cid" + "github.com/ipld/go-car/v2/index" + "github.com/ipld/go-car/v2/internal/carv1" + "github.com/ipld/go-car/v2/internal/loader" + ipld "github.com/ipld/go-ipld-prime" + "github.com/ipld/go-ipld-prime/datamodel" + "github.com/ipld/go-ipld-prime/linking" + cidlink "github.com/ipld/go-ipld-prime/linking/cid" + "github.com/ipld/go-ipld-prime/node/basicnode" + "github.com/ipld/go-ipld-prime/traversal" + "github.com/ipld/go-ipld-prime/traversal/selector" +) + +// ErrSizeMismatch is returned when a written traversal realizes the written header size does not +// match the actual number of car bytes written. 
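+//
+// In practice this guards the two-pass flow used by NewSelectiveWriter: the
+// first traversal precomputes the payload size for the CARv2 header, and the
+// traversal performed at WriteTo time must then produce exactly that many
+// bytes.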
+var ErrSizeMismatch = fmt.Errorf("car-error-sizemismatch")
+
+// ErrOffsetImpossible is returned when specified paddings or offsets of either a wrapped carv1
+// or index cannot be satisfied based on the data being written.
+var ErrOffsetImpossible = fmt.Errorf("car-error-offsetimpossible")
+
+// MaxTraversalLinks changes the allowed number of links a selector traversal
+// can visit before failing.
+//
+// Note that setting this option may cause an error to be returned from selector
+// execution when building a SelectiveCar.
+func MaxTraversalLinks(MaxTraversalLinks uint64) Option {
+	return func(sco *Options) {
+		sco.MaxTraversalLinks = MaxTraversalLinks
+	}
+}
+
+// NewSelectiveWriter walks through the proposed dag traversal to learn its total size in order to be able to
+// stream out a car to a writer in the expected traversal order in one go.
+func NewSelectiveWriter(ctx context.Context, ls *ipld.LinkSystem, root cid.Cid, selector ipld.Node, opts ...Option) (Writer, error) {
+	cls, cntr := loader.CountingLinkSystem(*ls)
+
+	c1h := carv1.CarHeader{Roots: []cid.Cid{root}, Version: 1}
+	headSize, err := carv1.HeaderSize(&c1h)
+	if err != nil {
+		return nil, err
+	}
+	if err := traverse(ctx, &cls, root, selector, ApplyOptions(opts...)); err != nil {
+		return nil, err
+	}
+	tc := traversalCar{
+		size:     headSize + cntr.Size(),
+		ctx:      ctx,
+		root:     root,
+		selector: selector,
+		ls:       ls,
+		opts:     ApplyOptions(opts...),
+	}
+	return &tc, nil
+}
+
+// TraverseToFile writes a car file matching a given root and selector to the
+// path at `destination` using one read of each block.
+func TraverseToFile(ctx context.Context, ls *ipld.LinkSystem, root cid.Cid, selector ipld.Node, destination string, opts ...Option) error {
+	tc := traversalCar{
+		size:     0,
+		ctx:      ctx,
+		root:     root,
+		selector: selector,
+		ls:       ls,
+		opts:     ApplyOptions(opts...),
+	}
+
+	fp, err := os.Create(destination)
+	if err != nil {
+		return err
+	}
+	defer fp.Close()
+
+	_, err = tc.WriteTo(fp)
+	if err != nil {
+		return err
+	}
+
+	// fix header size.
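+	// The CARv2 header written by WriteTo above was provisional (tc.size was
+	// zero until WriteV1 measured the payload), so seek back to the start and
+	// rewrite the pragma and header with the now-known data size.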
+	if _, err = fp.Seek(0, 0); err != nil {
+		return err
+	}
+
+	if _, err = tc.WriteV2Header(fp); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// TraverseV1 walks through the proposed dag traversal and writes a carv1 to the provided io.Writer
+func TraverseV1(ctx context.Context, ls *ipld.LinkSystem, root cid.Cid, selector ipld.Node, writer io.Writer, opts ...Option) (uint64, error) {
+	opts = append(opts, WithoutIndex())
+	tc := traversalCar{
+		size:     0,
+		ctx:      ctx,
+		root:     root,
+		selector: selector,
+		ls:       ls,
+		opts:     ApplyOptions(opts...),
+	}
+
+	size, _, err := tc.WriteV1(writer)
+	return size, err
+}
+
+// Writer is an interface allowing writing a car prepared by NewSelectiveWriter
+type Writer interface {
+	io.WriterTo
+}
+
+var _ Writer = (*traversalCar)(nil)
+
+type traversalCar struct {
+	size     uint64
+	ctx      context.Context
+	root     cid.Cid
+	selector ipld.Node
+	ls       *ipld.LinkSystem
+	opts     Options
+}
+
+func (tc *traversalCar) WriteTo(w io.Writer) (int64, error) {
+	n, err := tc.WriteV2Header(w)
+	if err != nil {
+		return n, err
+	}
+	v1s, idx, err := tc.WriteV1(w)
+	n += int64(v1s)
+
+	if err != nil {
+		return n, err
+	}
+
+	// index padding, then index
+	if tc.opts.IndexCodec != index.CarIndexNone {
+		if tc.opts.IndexPadding > 0 {
+			buf := make([]byte, tc.opts.IndexPadding)
+			pn, err := w.Write(buf)
+			n += int64(pn)
+			if err != nil {
+				return n, err
+			}
+		}
+		in, err := index.WriteTo(idx, w)
+		n += int64(in)
+		if err != nil {
+			return n, err
+		}
+	}
+
+	return n, err
+}
+
+func (tc *traversalCar) WriteV2Header(w io.Writer) (int64, error) {
+	n, err := w.Write(Pragma)
+	if err != nil {
+		return int64(n), err
+	}
+
+	h := NewHeader(tc.size)
+	if p := tc.opts.DataPadding; p > 0 {
+		h = h.WithDataPadding(p)
+	}
+	if p := tc.opts.IndexPadding; p > 0 {
+		h = h.WithIndexPadding(p)
+	}
+	if tc.opts.IndexCodec == index.CarIndexNone {
+		h.IndexOffset = 0
+	}
+	hn, err := h.WriteTo(w)
+	if err != nil {
+		return int64(n) + hn, err
+	}
+	hn += int64(n)
+
+	// We include the initial data padding after the carv2 header
+	if h.DataOffset > uint64(hn) {
+		// TODO: buffer writes if this needs to be big.
+		buf := make([]byte, h.DataOffset-uint64(hn))
+		n, err = w.Write(buf)
+		hn += int64(n)
+		if err != nil {
+			return hn, err
+		}
+	} else if h.DataOffset < uint64(hn) {
+		return hn, ErrOffsetImpossible
+	}
+
+	return hn, nil
+}
+
+func (tc *traversalCar) WriteV1(w io.Writer) (uint64, index.Index, error) {
+	// write the v1 header
+	c1h := carv1.CarHeader{Roots: []cid.Cid{tc.root}, Version: 1}
+	if err := carv1.WriteHeader(&c1h, w); err != nil {
+		return 0, nil, err
+	}
+	v1Size, err := carv1.HeaderSize(&c1h)
+	if err != nil {
+		return v1Size, nil, err
+	}
+
+	// write the blocks.
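+	// The teeing link system (an internal helper) wraps tc.ls so that each
+	// block the traversal loads is also copied to w as a CARv1 section, while
+	// section offsets (starting at v1Size) are recorded for index construction.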
+ wls, writer := loader.TeeingLinkSystem(*tc.ls, w, v1Size, tc.opts.IndexCodec) + err = traverse(tc.ctx, &wls, tc.root, tc.selector, tc.opts) + v1Size = writer.Size() + if err != nil { + return v1Size, nil, err + } + if tc.size != 0 && tc.size != v1Size { + return v1Size, nil, ErrSizeMismatch + } + tc.size = v1Size + + if tc.opts.IndexCodec == index.CarIndexNone { + return v1Size, nil, nil + } + idx, err := writer.Index() + return v1Size, idx, err +} + +func traverse(ctx context.Context, ls *ipld.LinkSystem, root cid.Cid, s ipld.Node, opts Options) error { + sel, err := selector.CompileSelector(s) + if err != nil { + return err + } + + chooser := func(_ ipld.Link, _ linking.LinkContext) (ipld.NodePrototype, error) { + return basicnode.Prototype.Any, nil + } + if opts.TraversalPrototypeChooser != nil { + chooser = opts.TraversalPrototypeChooser + } + + progress := traversal.Progress{ + Cfg: &traversal.Config{ + Ctx: ctx, + LinkSystem: *ls, + LinkTargetNodePrototypeChooser: chooser, + LinkVisitOnlyOnce: !opts.BlockstoreAllowDuplicatePuts, + }, + } + if opts.MaxTraversalLinks < math.MaxInt64 { + progress.Budget = &traversal.Budget{ + NodeBudget: math.MaxInt64, + LinkBudget: int64(opts.MaxTraversalLinks), + } + } + + lnk := cidlink.Link{Cid: root} + ls.TrustedStorage = true + rp, err := chooser(lnk, ipld.LinkContext{}) + if err != nil { + return err + } + rootNode, err := ls.Load(ipld.LinkContext{}, lnk, rp) + if err != nil { + return fmt.Errorf("root blk load failed: %s", err) + } + err = progress.WalkMatching(rootNode, sel, func(_ traversal.Progress, node ipld.Node) error { + if lbn, ok := node.(datamodel.LargeBytesNode); ok { + s, err := lbn.AsLargeBytes() + if err != nil { + return err + } + _, err = io.Copy(io.Discard, s) + if err != nil { + return err + } + } + return nil + }) + if err != nil { + return fmt.Errorf("walk failed: %s", err) + } + return nil +} diff --git a/ipld/car/v2/selective_test.go b/ipld/car/v2/selective_test.go new file mode 100644 index 0000000000..9693309281 --- /dev/null +++ b/ipld/car/v2/selective_test.go @@ -0,0 +1,137 @@ +package car_test + +import ( + "bytes" + "context" + "io" + "os" + "path" + "testing" + + "github.com/ipfs/go-cid" + blocks "github.com/ipfs/go-libipfs/blocks" + "github.com/ipfs/go-unixfsnode" + "github.com/ipfs/go-unixfsnode/data/builder" + "github.com/ipld/go-car/v2" + "github.com/ipld/go-car/v2/blockstore" + dagpb "github.com/ipld/go-codec-dagpb" + "github.com/ipld/go-ipld-prime/datamodel" + "github.com/ipld/go-ipld-prime/linking" + cidlink "github.com/ipld/go-ipld-prime/linking/cid" + basicnode "github.com/ipld/go-ipld-prime/node/basic" + "github.com/ipld/go-ipld-prime/storage/bsadapter" + sb "github.com/ipld/go-ipld-prime/traversal/selector/builder" + selectorparse "github.com/ipld/go-ipld-prime/traversal/selector/parse" + "github.com/stretchr/testify/require" + + _ "github.com/ipld/go-ipld-prime/codec/dagcbor" + _ "github.com/ipld/go-ipld-prime/codec/raw" +) + +func TestPrepareTraversal(t *testing.T) { + from, err := blockstore.OpenReadOnly("testdata/sample-unixfs-v2.car") + require.NoError(t, err) + ls := cidlink.DefaultLinkSystem() + bsa := bsadapter.Adapter{Wrapped: from} + ls.SetReadStorage(&bsa) + + rts, _ := from.Roots() + writer, err := car.NewSelectiveWriter(context.Background(), &ls, rts[0], selectorparse.CommonSelector_ExploreAllRecursively) + require.NoError(t, err) + + buf := bytes.Buffer{} + n, err := writer.WriteTo(&buf) + require.NoError(t, err) + require.Equal(t, int64(len(buf.Bytes())), n) + + fi, _ := 
os.Stat("testdata/sample-unixfs-v2.car") + require.Equal(t, fi.Size(), n) + + // Headers should be equal + h1, _ := car.OpenReader("testdata/sample-unixfs-v2.car") + h1h := bytes.Buffer{} + h1h.Write(car.Pragma) + h1.Header.WriteTo(&h1h) + require.Equal(t, buf.Bytes()[:h1h.Len()], h1h.Bytes()) +} + +func TestFileTraversal(t *testing.T) { + from, err := blockstore.OpenReadOnly("testdata/sample-unixfs-v2.car") + require.NoError(t, err) + ls := cidlink.DefaultLinkSystem() + bsa := bsadapter.Adapter{Wrapped: from} + ls.SetReadStorage(&bsa) + + rts, _ := from.Roots() + outDir := t.TempDir() + err = car.TraverseToFile(context.Background(), &ls, rts[0], selectorparse.CommonSelector_ExploreAllRecursively, path.Join(outDir, "out.car")) + require.NoError(t, err) + + require.FileExists(t, path.Join(outDir, "out.car")) + + fa, _ := os.Stat("testdata/sample-unixfs-v2.car") + fb, _ := os.Stat(path.Join(outDir, "out.car")) + require.Equal(t, fa.Size(), fb.Size()) +} + +func TestV1Traversal(t *testing.T) { + from, err := blockstore.OpenReadOnly("testdata/sample-v1.car") + require.NoError(t, err) + ls := cidlink.DefaultLinkSystem() + bsa := bsadapter.Adapter{Wrapped: from} + ls.SetReadStorage(&bsa) + + rts, _ := from.Roots() + w := bytes.NewBuffer(nil) + n, err := car.TraverseV1(context.Background(), &ls, rts[0], selectorparse.CommonSelector_ExploreAllRecursively, w) + require.NoError(t, err) + require.Equal(t, int64(len(w.Bytes())), int64(n)) + + fa, _ := os.Stat("testdata/sample-v1.car") + require.Equal(t, fa.Size(), int64(n)) +} + +func TestPartialTraversal(t *testing.T) { + store := cidlink.Memory{Bag: make(map[string][]byte)} + ls := cidlink.DefaultLinkSystem() + ls.StorageReadOpener = store.OpenRead + ls.StorageWriteOpener = store.OpenWrite + unixfsnode.AddUnixFSReificationToLinkSystem(&ls) + + // write a unixfs file. + initBuf := bytes.Buffer{} + _, _ = initBuf.Write(make([]byte, 1000000)) + rt, _, err := builder.BuildUnixFSFile(&initBuf, "", &ls) + require.NoError(t, err) + + // read a subset of the file. + _, rts, err := cid.CidFromBytes([]byte(rt.Binary())) + require.NoError(t, err) + ssb := sb.NewSelectorSpecBuilder(basicnode.Prototype.Any) + sel := ssb.ExploreInterpretAs("unixfs", ssb.MatcherSubset(0, 256*1000)) + buf := bytes.Buffer{} + chooser := dagpb.AddSupportToChooser(func(l datamodel.Link, lc linking.LinkContext) (datamodel.NodePrototype, error) { + return basicnode.Prototype.Any, nil + }) + _, err = car.TraverseV1(context.Background(), &ls, rts, sel.Node(), &buf, car.WithTraversalPrototypeChooser(chooser)) + require.NoError(t, err) + + fb := len(buf.Bytes()) + require.Less(t, fb, 1000000) + + loaded, err := car.NewBlockReader(&buf) + require.NoError(t, err) + fnd := make(map[cid.Cid]struct{}) + var b blocks.Block + for err == nil { + b, err = loaded.Next() + if err == io.EOF { + break + } + if _, ok := fnd[b.Cid()]; ok { + require.Fail(t, "duplicate block present", b.Cid()) + } + fnd[b.Cid()] = struct{}{} + } + require.Equal(t, 2, len(fnd)) +} diff --git a/ipld/car/v2/storage/doc.go b/ipld/car/v2/storage/doc.go new file mode 100644 index 0000000000..21e049b21b --- /dev/null +++ b/ipld/car/v2/storage/doc.go @@ -0,0 +1,75 @@ +// Package storage provides a CAR abstraction for the +// github.com/ipld/go-ipld-prime/storage interfaces in the form of a StorageCar. +// +// THIS PACKAGE IS EXPERIMENTAL. Breaking changes may be introduced in +// semver-minor releases before this package stabilizes. Use with caution and +// prefer the blockstore API if stability is required. 
+//
+// StorageCar as ReadableStorage provides basic Get and Has operations. It also
+// implements StreamingReadableStorage for the more efficient GetStream
+// operation, which is easily supported by the CAR format.
+//
+// StorageCar as WritableStorage provides the Put operation. It does not
+// implement StreamingWritableStorage because the CAR format requires CIDs to
+// be written before the blocks themselves, which is not possible with
+// StreamingWritableStorage without buffering. The PutStream function in
+// github.com/ipld/go-ipld-prime/storage therefore provides equivalent
+// functionality here, buffering the stream before performing a plain Put.
+//
+// StorageCar can be used with an IPLD LinkSystem, defined by
+// github.com/ipld/go-ipld-prime/linking, with the
+// linking.SetReadStorage and linking.SetWriteStorage functions, to provide
+// read and/or write access to a CAR as required.
+//
+// The focus of the StorageCar interfaces is to use the minimal possible IO
+// interface for the operation(s) being performed.
+//
+// • OpenReadable requires an io.ReaderAt, as seeking is required for
+// random-access reads through a ReadableStorage.
+//
+// • NewWritable requires an io.Writer when used to write a CARv1, as this format
+// can be written in a continuous stream as blocks are written through a
+// WritableStorage (i.e. when the WriteAsCarV1 option is turned on). When used to
+// write a CARv2, the default mode, a random-access io.WriterAt is required, as
+// the CARv2 header must be written after the payload is finalized and the index
+// written, in order to indicate the payload location in the output. A plain
+// Writable used this way can stream CARv1 contents without buffering, storing
+// only CIDs in memory for de-duplication (where required) and to still allow
+// Has operations.
+//
+// • NewReadableWritable requires an io.ReaderAt and an io.Writer as it combines
+// the functionality of NewWritable with OpenReadable, being able to random-
+// access read any written blocks.
+//
+// • OpenReadableWritable requires an io.ReaderAt, an io.Writer and an
+// io.WriterAt as it extends the NewReadableWritable functionality with the
+// ability to resume an existing CAR. In addition, if the CAR being resumed is
+// a CARv2, the IO object being provided must have a Truncate() method (e.g.
+// an os.File) in order to properly manage the CAR lifecycle and avoid writing a
+// corrupt CAR.
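+//
+// As a non-normative sketch of the read path (the file name and panic-based
+// error handling are illustrative assumptions, not part of this API), a CAR
+// can back an IPLD LinkSystem like so:
+//
+//	package main
+//
+//	import (
+//		"fmt"
+//		"os"
+//
+//		"github.com/ipld/go-car/v2/storage"
+//		cidlink "github.com/ipld/go-ipld-prime/linking/cid"
+//	)
+//
+//	func main() {
+//		f, err := os.Open("example.car") // assumed input path
+//		if err != nil {
+//			panic(err)
+//		}
+//		defer f.Close()
+//		readable, err := storage.OpenReadable(f)
+//		if err != nil {
+//			panic(err)
+//		}
+//		ls := cidlink.DefaultLinkSystem()
+//		ls.SetReadStorage(readable) // block loads now come from the CAR
+//		fmt.Println("roots:", readable.Roots())
+//	}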
+// +// The following options are available to customize the behavior of the +// StorageCar: +// +// • WriteAsCarV1 +// +// • StoreIdentityCIDs +// +// • AllowDuplicatePuts +// +// • UseWholeCIDs +// +// • ZeroLengthSectionAsEOF +// +// • UseIndexCodec +// +// • UseDataPadding +// +// • UseIndexPadding +// +// • MaxIndexCidSize +// +// • MaxAllowedHeaderSize +// +// • MaxAllowedSectionSize +package storage diff --git a/ipld/car/v2/storage/notfound.go b/ipld/car/v2/storage/notfound.go new file mode 100644 index 0000000000..82153e2f2e --- /dev/null +++ b/ipld/car/v2/storage/notfound.go @@ -0,0 +1,39 @@ +package storage + +import "github.com/ipfs/go-cid" + +// compatible with the go-ipld-format ErrNotFound, match against +// interface{NotFound() bool} +// this could go into go-ipld-prime, but for now we'll just exercise the +// feature-test pattern + +type ErrNotFound struct { + Cid cid.Cid +} + +func (e ErrNotFound) Error() string { + if e.Cid == cid.Undef { + return "ipld: could not find node" + } + return "ipld: could not find " + e.Cid.String() +} + +func (e ErrNotFound) NotFound() bool { + return true +} + +func (e ErrNotFound) Is(err error) bool { + switch err.(type) { + case ErrNotFound: + return true + default: + return false + } +} + +func IsNotFound(err error) bool { + if nf, ok := err.(interface{ NotFound() bool }); ok { + return nf.NotFound() + } + return false +} diff --git a/ipld/car/v2/storage/storage.go b/ipld/car/v2/storage/storage.go new file mode 100644 index 0000000000..1a8b499b9f --- /dev/null +++ b/ipld/car/v2/storage/storage.go @@ -0,0 +1,505 @@ +package storage + +import ( + "bytes" + "context" + "errors" + "fmt" + "io" + "sync" + + "github.com/ipfs/go-cid" + carv2 "github.com/ipld/go-car/v2" + "github.com/ipld/go-car/v2/index" + "github.com/ipld/go-car/v2/internal/carv1" + "github.com/ipld/go-car/v2/internal/carv1/util" + internalio "github.com/ipld/go-car/v2/internal/io" + "github.com/ipld/go-car/v2/internal/store" + ipldstorage "github.com/ipld/go-ipld-prime/storage" +) + +var errClosed = errors.New("cannot use a CARv2 storage after closing") + +type ReaderAtWriterAt interface { + io.ReaderAt + io.Writer + io.WriterAt +} + +type ReadableCar interface { + ipldstorage.ReadableStorage + ipldstorage.StreamingReadableStorage + Roots() []cid.Cid +} + +// WritableCar is compatible with storage.WritableStorage but also returns +// the roots of the CAR. It does not implement ipld.StreamingWritableStorage +// as the CAR format does not support streaming data followed by its CID, so +// any streaming implementation would perform buffering and copy the +// existing storage.PutStream() implementation. +type WritableCar interface { + ipldstorage.WritableStorage + Roots() []cid.Cid + Finalize() error +} + +var _ ReadableCar = (*StorageCar)(nil) +var _ WritableCar = (*StorageCar)(nil) + +type StorageCar struct { + idx index.Index + reader io.ReaderAt + writer positionedWriter + dataWriter *internalio.OffsetWriteSeeker + header carv2.Header + roots []cid.Cid + opts carv2.Options + + closed bool + mu sync.RWMutex +} + +type positionedWriter interface { + io.Writer + Position() int64 +} + +// OpenReadable opens a CARv1 or CARv2 file for reading as a ReadableStorage +// and StreamingReadableStorage as defined by +// github.com/ipld/go-ipld-prime/storage. +// +// The returned ReadableStorage is compatible with a linksystem SetReadStorage +// method as defined by github.com/ipld/go-ipld-prime/linking +// to provide a block source backed by a CAR. 
+//
+// When opening a CAR, an initial scan is performed to generate an index, or
+// load an index from a CARv2 index where available. This index data is kept in
+// memory while the CAR is being used in order to provide efficient random-access
+// Get and Has operations on blocks.
+//
+// The returned ReadableCar also supports StreamingReadableStorage, allowing
+// efficient GetStream reads straight out of the underlying CAR where the
+// linksystem can make use of it.
+func OpenReadable(reader io.ReaderAt, opts ...carv2.Option) (ReadableCar, error) {
+	sc := &StorageCar{opts: carv2.ApplyOptions(opts...)}
+
+	rr := internalio.ToReadSeeker(reader)
+	header, err := carv1.ReadHeader(rr, sc.opts.MaxAllowedHeaderSize)
+	if err != nil {
+		return nil, err
+	}
+	switch header.Version {
+	case 1:
+		sc.roots = header.Roots
+		sc.reader = reader
+		if _, err := rr.Seek(0, io.SeekStart); err != nil {
+			return nil, err
+		}
+		sc.idx = store.NewInsertionIndex()
+		if err := carv2.LoadIndex(sc.idx, rr, opts...); err != nil {
+			return nil, err
+		}
+	case 2:
+		v2r, err := carv2.NewReader(reader, opts...)
+		if err != nil {
+			return nil, err
+		}
+		sc.roots, err = v2r.Roots()
+		if err != nil {
+			return nil, err
+		}
+		if v2r.Header.HasIndex() {
+			ir, err := v2r.IndexReader()
+			if err != nil {
+				return nil, err
+			}
+			sc.idx, err = index.ReadFrom(ir)
+			if err != nil {
+				return nil, err
+			}
+		} else {
+			dr, err := v2r.DataReader()
+			if err != nil {
+				return nil, err
+			}
+			sc.idx = store.NewInsertionIndex()
+			if err := carv2.LoadIndex(sc.idx, dr, opts...); err != nil {
+				return nil, err
+			}
+		}
+		if sc.reader, err = v2r.DataReader(); err != nil {
+			return nil, err
+		}
+	default:
+		return nil, fmt.Errorf("unsupported CAR version: %v", header.Version)
+	}
+
+	return sc, nil
+}
+
+// NewWritable creates a new WritableStorage as defined by
+// github.com/ipld/go-ipld-prime/storage that writes a CARv1 or CARv2 format to
+// the given io.Writer.
+//
+// The returned WritableStorage is compatible with a linksystem SetWriteStorage
+// method as defined by github.com/ipld/go-ipld-prime/linking
+// to provide a block sink backed by a CAR.
+//
+// The WritableStorage supports Put operations, which will write
+// blocks to the CAR in the order they are received.
+//
+// When writing a CARv2 format (the default), the provided writer must be
+// compatible with io.WriterAt in order to provide random access, as the CARv2
+// header must be written after the blocks in order to indicate the size of the
+// CARv2 data payload.
+//
+// A CARv1 (generated using the WriteAsCarV1 option) only requires an io.Writer
+// and can therefore stream CAR contents as blocks are written, while still
+// providing Has operations and the ability to avoid writing duplicate blocks
+// as required.
+//
+// When writing a CARv2 format, it is important to call the Finalize method on
+// the returned WritableStorage in order to write the CARv2 header and index.
+func NewWritable(writer io.Writer, roots []cid.Cid, opts ...carv2.Option) (WritableCar, error) {
+	sc, err := newWritable(writer, roots, opts...)
+	if err != nil {
+		return nil, err
+	}
+	return sc.init()
+}
+
+func newWritable(writer io.Writer, roots []cid.Cid, opts ...carv2.Option) (*StorageCar, error) {
+	sc := &StorageCar{
+		writer: &positionTrackingWriter{w: writer},
+		idx:    store.NewInsertionIndex(),
+		header: carv2.NewHeader(0),
+		opts:   carv2.ApplyOptions(opts...),
+		roots:  roots,
+	}
+
+	if p := sc.opts.DataPadding; p > 0 {
+		sc.header = sc.header.WithDataPadding(p)
+	}
+	if p := sc.opts.IndexPadding; p > 0 {
+		sc.header = sc.header.WithIndexPadding(p)
+	}
+
+	offset := int64(sc.header.DataOffset)
+	if sc.opts.WriteAsCarV1 {
+		offset = 0
+	}
+
+	if writerAt, ok := writer.(io.WriterAt); ok {
+		sc.dataWriter = internalio.NewOffsetWriter(writerAt, offset)
+	} else if !sc.opts.WriteAsCarV1 {
+		return nil, fmt.Errorf("cannot write as CARv2 to a non-seekable writer")
+	}
+
+	return sc, nil
+}
+
+func newReadableWritable(rw ReaderAtWriterAt, roots []cid.Cid, opts ...carv2.Option) (*StorageCar, error) {
+	sc, err := newWritable(rw, roots, opts...)
+	if err != nil {
+		return nil, err
+	}
+
+	sc.reader = rw
+	if !sc.opts.WriteAsCarV1 {
+		sc.reader, err = internalio.NewOffsetReadSeeker(rw, int64(sc.header.DataOffset))
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	return sc, nil
+}
+
+// NewReadableWritable creates a new StorageCar that is able to provide both
+// ReadableStorage and WritableStorage functionality.
+//
+// The returned StorageCar is compatible with the linksystem SetReadStorage and
+// SetWriteStorage methods as defined by github.com/ipld/go-ipld-prime/linking.
+//
+// When writing a CARv2 format, it is important to call the Finalize method on
+// the returned WritableStorage in order to write the CARv2 header and index.
+func NewReadableWritable(rw ReaderAtWriterAt, roots []cid.Cid, opts ...carv2.Option) (*StorageCar, error) {
+	sc, err := newReadableWritable(rw, roots, opts...)
+	if err != nil {
+		return nil, err
+	}
+	if _, err := sc.init(); err != nil {
+		return nil, err
+	}
+	return sc, nil
+}
+
+// OpenReadableWritable creates a new StorageCar that is able to provide both
+// ReadableStorage and WritableStorage functionality.
+//
+// The returned StorageCar is compatible with the linksystem SetReadStorage and
+// SetWriteStorage methods as defined by github.com/ipld/go-ipld-prime/linking.
+//
+// It attempts to resume a CARv2 file that was previously written by NewWritable
+// or NewReadableWritable.
+func OpenReadableWritable(rw ReaderAtWriterAt, roots []cid.Cid, opts ...carv2.Option) (*StorageCar, error) {
+	sc, err := newReadableWritable(rw, roots, opts...)
+	if err != nil {
+		return nil, err
+	}
+
+	// attempt to resume
+	rs, err := internalio.NewOffsetReadSeeker(rw, 0)
+	if err != nil {
+		return nil, err
+	}
+	if err := store.ResumableVersion(rs, sc.opts.WriteAsCarV1); err != nil {
+		return nil, err
+	}
+	if err := store.Resume(
+		rw,
+		sc.reader,
+		sc.dataWriter,
+		sc.idx.(*store.InsertionIndex),
+		roots,
+		sc.header.DataOffset,
+		sc.opts.WriteAsCarV1,
+		sc.opts.MaxAllowedHeaderSize,
+		sc.opts.ZeroLengthSectionAsEOF,
+	); err != nil {
+		return nil, err
+	}
+	return sc, nil
+}
+
+func (sc *StorageCar) init() (WritableCar, error) {
+	if !sc.opts.WriteAsCarV1 {
+		if _, err := sc.writer.Write(carv2.Pragma); err != nil {
+			return nil, err
+		}
+	}
+	var w io.Writer = sc.dataWriter
+	if sc.dataWriter == nil {
+		w = sc.writer
+	}
+	if err := carv1.WriteHeader(&carv1.CarHeader{Roots: sc.roots, Version: 1}, w); err != nil {
+		return nil, err
+	}
+	return sc, nil
+}
+
+// Roots returns the roots of the CAR.
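+// For a writable CAR these are the roots supplied at construction time; for a
+// CAR opened with OpenReadable they are the roots read from the CAR header.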
+func (sc *StorageCar) Roots() []cid.Cid {
+	return sc.roots
+}
+
+// Put adds a block to the CAR, where the block is identified by the given CID
+// provided in string form. The keyStr value must be a valid CID binary string
+// (not a multibase string representation), i.e. generated with CID#KeyString().
+func (sc *StorageCar) Put(ctx context.Context, keyStr string, data []byte) error {
+	keyCid, err := cid.Cast([]byte(keyStr))
+	if err != nil {
+		return fmt.Errorf("bad CID key: %w", err)
+	}
+
+	sc.mu.Lock()
+	defer sc.mu.Unlock()
+
+	if sc.closed {
+		return errClosed
+	}
+
+	idx, ok := sc.idx.(*store.InsertionIndex)
+	if !ok || sc.writer == nil {
+		return fmt.Errorf("cannot put into a read-only CAR")
+	}
+
+	if should, err := store.ShouldPut(
+		idx,
+		keyCid,
+		sc.opts.MaxIndexCidSize,
+		sc.opts.StoreIdentityCIDs,
+		sc.opts.BlockstoreAllowDuplicatePuts,
+		sc.opts.BlockstoreUseWholeCIDs,
+	); err != nil {
+		return err
+	} else if !should {
+		return nil
+	}
+
+	w := sc.writer
+	if sc.dataWriter != nil {
+		w = sc.dataWriter
+	}
+	n := uint64(w.Position())
+	if err := util.LdWrite(w, keyCid.Bytes(), data); err != nil {
+		return err
+	}
+	idx.InsertNoReplace(keyCid, n)
+
+	return nil
+}
+
+// Has returns true if the CAR contains a block identified by the given CID
+// provided in string form. The keyStr value must be a valid CID binary string
+// (not a multibase string representation), i.e. generated with CID#KeyString().
+func (sc *StorageCar) Has(ctx context.Context, keyStr string) (bool, error) {
+	keyCid, err := cid.Cast([]byte(keyStr))
+	if err != nil {
+		return false, fmt.Errorf("bad CID key: %w", err)
+	}
+
+	sc.mu.RLock()
+	defer sc.mu.RUnlock()
+
+	if sc.closed {
+		return false, errClosed
+	}
+
+	if idx, ok := sc.idx.(*store.InsertionIndex); ok && sc.writer != nil {
+		// writable CAR, fast path using InsertionIndex
+		return store.Has(
+			idx,
+			keyCid,
+			sc.opts.MaxIndexCidSize,
+			sc.opts.StoreIdentityCIDs,
+			sc.opts.BlockstoreAllowDuplicatePuts,
+			sc.opts.BlockstoreUseWholeCIDs,
+		)
+	}
+
+	if !sc.opts.StoreIdentityCIDs {
+		// When StoreIdentityCIDs is off, identity CIDs are never written to the
+		// CAR because their data is embedded in the CID itself, so we can report
+		// them as present without consulting the index.
+		if _, ok, err := store.IsIdentity(keyCid); err != nil {
+			return false, err
+		} else if ok {
+			return true, nil
+		}
+	}
+
+	_, _, size, err := store.FindCid(
+		sc.reader,
+		sc.idx,
+		keyCid,
+		sc.opts.BlockstoreUseWholeCIDs,
+		sc.opts.ZeroLengthSectionAsEOF,
+		sc.opts.MaxAllowedSectionSize,
+		false,
+	)
+	if errors.Is(err, index.ErrNotFound) {
+		return false, nil
+	} else if err != nil {
+		return false, err
+	}
+	return size > -1, nil
+}
+
+// Get returns the block bytes identified by the given CID provided in string
+// form. The keyStr value must be a valid CID binary string (not a multibase
+// string representation), i.e. generated with CID#KeyString().
+func (sc *StorageCar) Get(ctx context.Context, keyStr string) ([]byte, error) {
+	rdr, err := sc.GetStream(ctx, keyStr)
+	if err != nil {
+		return nil, err
+	}
+	return io.ReadAll(rdr)
+}
+
+// GetStream returns a stream of the block bytes identified by the given CID
+// provided in string form. The keyStr value must be a valid CID binary string
+// (not a multibase string representation), i.e. generated with CID#KeyString().
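+//
+// A StorageCar constructed by NewWritable has no reader attached, so GetStream
+// (and therefore Get) returns an error for it; use NewReadableWritable when
+// written blocks need to be read back.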
+func (sc *StorageCar) GetStream(ctx context.Context, keyStr string) (io.ReadCloser, error) {
+	if sc.reader == nil {
+		return nil, fmt.Errorf("cannot read from a write-only CAR")
+	}
+
+	keyCid, err := cid.Cast([]byte(keyStr))
+	if err != nil {
+		return nil, fmt.Errorf("bad CID key: %w", err)
+	}
+
+	if !sc.opts.StoreIdentityCIDs {
+		// When StoreIdentityCIDs is off, identity CIDs are never written to the
+		// CAR because their data is embedded in the CID itself, so serve the
+		// digest directly. No locking is needed; this check touches no shared state.
+		if digest, ok, err := store.IsIdentity(keyCid); err != nil {
+			return nil, err
+		} else if ok {
+			return io.NopCloser(bytes.NewReader(digest)), nil
+		}
+	}
+
+	sc.mu.RLock()
+	defer sc.mu.RUnlock()
+
+	if sc.closed {
+		return nil, errClosed
+	}
+
+	_, offset, size, err := store.FindCid(
+		sc.reader,
+		sc.idx,
+		keyCid,
+		sc.opts.BlockstoreUseWholeCIDs,
+		sc.opts.ZeroLengthSectionAsEOF,
+		sc.opts.MaxAllowedSectionSize,
+		false,
+	)
+	if errors.Is(err, index.ErrNotFound) {
+		return nil, ErrNotFound{Cid: keyCid}
+	} else if err != nil {
+		return nil, err
+	}
+	return io.NopCloser(io.NewSectionReader(sc.reader, offset, int64(size))), nil
+}
+
+// Finalize writes the CAR index to the underlying writer if the CAR being
+// written is a CARv2. It also writes a finalized CARv2 header which details
+// the payload location. This should be called on a writable StorageCar in order
+// to avoid data loss.
+func (sc *StorageCar) Finalize() error {
+	idx, ok := sc.idx.(*store.InsertionIndex)
+	if !ok || sc.writer == nil {
+		// ignore this, it's not writable
+		return nil
+	}
+
+	if sc.opts.WriteAsCarV1 {
+		return nil
+	}
+
+	wat, ok := sc.writer.(*positionTrackingWriter).w.(io.WriterAt)
+	if !ok { // this should already have been checked at construction if this is a writable
+		return fmt.Errorf("cannot finalize a CARv2 without an io.WriterAt")
+	}
+
+	sc.mu.Lock()
+	defer sc.mu.Unlock()
+
+	if sc.closed {
+		// Allow duplicate Finalize calls, just like Close.
+		// Still return an error, as ReadOnly.Close does; callers may choose to ignore it.
+ return fmt.Errorf("called Finalize on a closed storage CAR") + } + + sc.closed = true + + return store.Finalize(wat, sc.header, idx, uint64(sc.dataWriter.Position()), sc.opts.StoreIdentityCIDs, sc.opts.IndexCodec) +} + +type positionTrackingWriter struct { + w io.Writer + offset int64 +} + +func (ptw *positionTrackingWriter) Write(p []byte) (int, error) { + written, err := ptw.w.Write(p) + ptw.offset += int64(written) + return written, err +} + +func (ptw *positionTrackingWriter) Position() int64 { + return ptw.offset +} diff --git a/ipld/car/v2/storage/storage_test.go b/ipld/car/v2/storage/storage_test.go new file mode 100644 index 0000000000..12b9ebce32 --- /dev/null +++ b/ipld/car/v2/storage/storage_test.go @@ -0,0 +1,1276 @@ +package storage_test + +// TODO: test readable can't write and writable can't read + +import ( + "bytes" + "context" + "crypto/sha512" + "errors" + "fmt" + "io" + "math/rand" + "os" + "path/filepath" + "sync" + "testing" + "time" + + "github.com/ipfs/go-cid" + carv2 "github.com/ipld/go-car/v2" + "github.com/ipld/go-car/v2/index" + "github.com/ipld/go-car/v2/internal/carv1" + "github.com/ipld/go-car/v2/storage" + "github.com/multiformats/go-multicodec" + "github.com/multiformats/go-multihash" + "github.com/stretchr/testify/require" +) + +var rng = rand.New(rand.NewSource(1413)) +var rngLk sync.Mutex + +func TestReadable(t *testing.T) { + tests := []struct { + name string + inputPath string + opts []carv2.Option + noIdCids bool + }{ + { + "OpenedWithCarV1", + "../testdata/sample-v1.car", + []carv2.Option{carv2.UseWholeCIDs(true), carv2.StoreIdentityCIDs(true)}, + false, + }, + { + "OpenedWithCarV1_NoIdentityCID", + "../testdata/sample-v1.car", + []carv2.Option{carv2.UseWholeCIDs(true)}, + false, + }, + { + "OpenedWithCarV2", + "../testdata/sample-wrapped-v2.car", + []carv2.Option{carv2.UseWholeCIDs(true), carv2.StoreIdentityCIDs(true)}, + // index already exists, but was made without identity CIDs, but opening with StoreIdentityCIDs(true) means we check the index + true, + }, + { + "OpenedWithCarV2_NoIdentityCID", + "../testdata/sample-wrapped-v2.car", + []carv2.Option{carv2.UseWholeCIDs(true)}, + false, + }, + { + "OpenedWithCarV1ZeroLenSection", + "../testdata/sample-v1-with-zero-len-section.car", + []carv2.Option{carv2.UseWholeCIDs(true), carv2.ZeroLengthSectionAsEOF(true)}, + false, + }, + { + "OpenedWithAnotherCarV1ZeroLenSection", + "../testdata/sample-v1-with-zero-len-section2.car", + []carv2.Option{carv2.UseWholeCIDs(true), carv2.ZeroLengthSectionAsEOF(true)}, + false, + }, + { + "IndexlessV2", + "../testdata/sample-v2-indexless.car", + []carv2.Option{carv2.UseWholeCIDs(true)}, + false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() + + // Setup new StorageCar + inputReader, err := os.Open(tt.inputPath) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, inputReader.Close()) }) + readable, err := storage.OpenReadable(inputReader, tt.opts...) + require.NoError(t, err) + + // Setup BlockReader to compare against + actualReader, err := os.Open(tt.inputPath) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, actualReader.Close()) }) + actual, err := carv2.NewBlockReader(actualReader, tt.opts...) + require.NoError(t, err) + + // Assert roots match v1 payload. 
+ require.Equal(t, actual.Roots, readable.Roots()) + + for { + wantBlock, err := actual.Next() + if err == io.EOF { + break + } + require.NoError(t, err) + + key := wantBlock.Cid() + + // Assert StorageCar contains key. + has, err := readable.Has(ctx, key.KeyString()) + require.NoError(t, err) + if key.Prefix().MhType == uint64(multicodec.Identity) && tt.noIdCids { + // fixture wasn't made with StoreIdentityCIDs, but we opened it with StoreIdentityCIDs, + // so they aren't there to find + require.False(t, has) + } else { + require.True(t, has) + } + + // Assert block itself matches v1 payload block. + if has { + gotBlock, err := readable.Get(ctx, key.KeyString()) + require.NoError(t, err) + require.Equal(t, wantBlock.RawData(), gotBlock) + + reader, err := readable.GetStream(ctx, key.KeyString()) + require.NoError(t, err) + data, err := io.ReadAll(reader) + require.NoError(t, err) + require.Equal(t, wantBlock.RawData(), data) + } + } + + // test not exists + c := randCid() + has, err := readable.Has(ctx, c.KeyString()) + require.NoError(t, err) + require.False(t, has) + + _, err = readable.Get(ctx, c.KeyString()) + require.True(t, errors.Is(err, storage.ErrNotFound{})) + require.True(t, storage.IsNotFound(err)) + require.Contains(t, err.Error(), c.String()) + + // random identity, should only find this if we _don't_ store identity CIDs + storeIdentity := carv2.ApplyOptions(tt.opts...).StoreIdentityCIDs + c = randIdentityCid() + + has, err = readable.Has(ctx, c.KeyString()) + require.NoError(t, err) + require.Equal(t, !storeIdentity, has) + + got, err := readable.Get(ctx, c.KeyString()) + if !storeIdentity { + require.NoError(t, err) + mh, err := multihash.Decode(c.Hash()) + require.NoError(t, err) + require.Equal(t, mh.Digest, got) + } else { + require.True(t, errors.Is(err, storage.ErrNotFound{})) + require.True(t, storage.IsNotFound(err)) + require.Contains(t, err.Error(), c.String()) + } + }) + } +} + +func TestReadableBadVersion(t *testing.T) { + f, err := os.Open("../testdata/sample-rootless-v42.car") + require.NoError(t, err) + t.Cleanup(func() { f.Close() }) + subject, err := storage.OpenReadable(f) + require.Errorf(t, err, "unsupported car version: 42") + require.Nil(t, subject) +} + +func TestWritable(t *testing.T) { + originalCarV1Path := "../testdata/sample-v1.car" + + variants := []struct { + name string + compareCarV1 string + options []carv2.Option + expectedV1StartOffset int64 + }{ + {"carv2_noopt", "sample-v1-noidentity.car", []carv2.Option{}, int64(carv2.PragmaSize + carv2.HeaderSize)}, + {"carv2_identity", "sample-v1.car", []carv2.Option{carv2.StoreIdentityCIDs(true)}, int64(carv2.PragmaSize + carv2.HeaderSize)}, + {"carv1", "sample-v1-noidentity.car", []carv2.Option{carv2.WriteAsCarV1(true)}, int64(0)}, + {"carv1_identity", "sample-v1.car", []carv2.Option{carv2.WriteAsCarV1(true), carv2.StoreIdentityCIDs(true)}, int64(0)}, + } + + for _, mode := range []string{"WithRead", "WithoutRead"} { + t.Run(mode, func(t *testing.T) { + for _, variant := range variants { + t.Run(variant.name, func(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() + + opts := carv2.ApplyOptions(variant.options...) 
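+					// ApplyOptions resolves the variant's option set once up front; the
+					// assertions below branch on WriteAsCarV1 and StoreIdentityCIDs.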
+ + // Setup input file using standard CarV1 reader + srcFile, err := os.Open(originalCarV1Path) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, srcFile.Close()) }) + r, err := carv1.NewCarReader(srcFile) + require.NoError(t, err) + + path := filepath.Join(t.TempDir(), fmt.Sprintf("writable_%s_%s.car", mode, variant.name)) + var dstFile *os.File + + var writable *storage.StorageCar + if mode == "WithoutRead" { + dstFile, err = os.OpenFile(path, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0644) + require.NoError(t, err) + t.Cleanup(func() { dstFile.Close() }) + var writer io.Writer = &writerOnly{dstFile} + if !opts.WriteAsCarV1 { + writer = &writerAtOnly{dstFile} + } + w, err := storage.NewWritable(writer, r.Header.Roots, variant.options...) + require.NoError(t, err) + writable = w.(*storage.StorageCar) + } else { + dstFile, err = os.OpenFile(path, os.O_CREATE|os.O_RDWR|os.O_TRUNC, 0644) + require.NoError(t, err) + t.Cleanup(func() { dstFile.Close() }) + writable, err = storage.NewReadableWritable(dstFile, r.Header.Roots, variant.options...) + require.NoError(t, err) + } + + require.Equal(t, r.Header.Roots, writable.Roots()) + + cids := make([]cid.Cid, 0) + var idCidCount int + for { + // read from source + b, err := r.Next() + if err == io.EOF { + break + } + require.NoError(t, err) + + // write to dest + err = writable.Put(ctx, b.Cid().KeyString(), b.RawData()) + require.NoError(t, err) + cids = append(cids, b.Cid()) + + dmh, err := multihash.Decode(b.Cid().Hash()) + require.NoError(t, err) + if dmh.Code == multihash.IDENTITY { + idCidCount++ + } + + if mode == "WithRead" { + // writable is a ReadableWritable / StorageCar + + // read back out the one we just wrote + gotBlock, err := writable.Get(ctx, b.Cid().KeyString()) + require.NoError(t, err) + require.Equal(t, b.RawData(), gotBlock) + + reader, err := writable.GetStream(ctx, b.Cid().KeyString()) + require.NoError(t, err) + data, err := io.ReadAll(reader) + require.NoError(t, err) + require.Equal(t, b.RawData(), data) + + // try reading a random one: + candIndex := rng.Intn(len(cids)) + var candidate cid.Cid + for _, c := range cids { + if candIndex == 0 { + candidate = c + break + } + candIndex-- + } + has, err := writable.Has(ctx, candidate.KeyString()) + require.NoError(t, err) + require.True(t, has) + + // not exists + c := randCid() + has, err = writable.Has(ctx, c.KeyString()) + require.NoError(t, err) + require.False(t, has) + _, err = writable.Get(ctx, c.KeyString()) + require.True(t, errors.Is(err, storage.ErrNotFound{})) + require.True(t, storage.IsNotFound(err)) + require.Contains(t, err.Error(), c.String()) + + // random identity, should only find this if we _don't_ store identity CIDs + c = randIdentityCid() + has, err = writable.Has(ctx, c.KeyString()) + require.NoError(t, err) + require.Equal(t, !opts.StoreIdentityCIDs, has) + + got, err := writable.Get(ctx, c.KeyString()) + if !opts.StoreIdentityCIDs { + require.NoError(t, err) + mh, err := multihash.Decode(c.Hash()) + require.NoError(t, err) + require.Equal(t, mh.Digest, got) + } else { + require.True(t, errors.Is(err, storage.ErrNotFound{})) + require.True(t, storage.IsNotFound(err)) + require.Contains(t, err.Error(), c.String()) + } + } + } + + err = writable.Finalize() + require.NoError(t, err) + + err = dstFile.Close() + require.NoError(t, err) + + // test header version using carv2 reader + reopen, err := os.Open(path) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, reopen.Close()) }) + rd, err := carv2.NewReader(reopen) + 
require.NoError(t, err) + require.Equal(t, opts.WriteAsCarV1, rd.Version == 1) + + // now compare the binary contents of the written file to the expected file + comparePath := filepath.Join("../testdata/", variant.compareCarV1) + compareStat, err := os.Stat(comparePath) + require.NoError(t, err) + + wrote, err := os.Open(path) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, wrote.Close()) }) + _, err = wrote.Seek(variant.expectedV1StartOffset, io.SeekStart) + require.NoError(t, err) + hasher := sha512.New() + gotWritten, err := io.Copy(hasher, io.LimitReader(wrote, compareStat.Size())) + require.NoError(t, err) + gotSum := hasher.Sum(nil) + + hasher.Reset() + compareV1, err := os.Open(comparePath) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, compareV1.Close()) }) + wantWritten, err := io.Copy(hasher, compareV1) + require.NoError(t, err) + wantSum := hasher.Sum(nil) + + require.Equal(t, wantWritten, gotWritten) + require.Equal(t, wantSum, gotSum) + }) + } + }) + } +} + +func TestCannotWriteableV2WithoutWriterAt(t *testing.T) { + w, err := storage.NewWritable(&writerOnly{os.Stdout}, []cid.Cid{}) + require.Error(t, err) + require.Nil(t, w) +} + +func TestErrorsWhenWritingCidTooLarge(t *testing.T) { + maxAllowedCidSize := uint64(20) + + path := filepath.Join(t.TempDir(), "writable-with-id-enabled-too-large.car") + out, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY, 0644) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, out.Close()) }) + subject, err := storage.NewWritable(out, []cid.Cid{}, carv2.MaxIndexCidSize(maxAllowedCidSize)) + require.NoError(t, err) + + // normal block but shorten the CID to make it acceptable + testCid, testData := randBlock() + mh, err := multihash.Decode(testCid.Hash()) + require.NoError(t, err) + dig := mh.Digest[:10] + shortMh, err := multihash.Encode(dig, mh.Code) + require.NoError(t, err) + testCid = cid.NewCidV1(mh.Code, shortMh) + + err = subject.Put(context.TODO(), testCid.KeyString(), testData) + require.NoError(t, err) + + // standard CID but too long for options + testCid, testData = randBlock() + err = subject.Put(context.TODO(), testCid.KeyString(), testData) + require.Equal(t, &carv2.ErrCidTooLarge{MaxSize: maxAllowedCidSize, CurrentSize: uint64(testCid.ByteLen())}, err) +} + +func TestConcurrentUse(t *testing.T) { + dst, err := os.OpenFile(filepath.Join(t.TempDir(), "readwrite.car"), os.O_CREATE|os.O_RDWR, 0644) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, dst.Close()) }) + wbs, err := storage.NewReadableWritable(dst, nil) + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() + + require.NoError(t, err) + t.Cleanup(func() { wbs.Finalize() }) + + var wg sync.WaitGroup + for i := 0; i < 100; i++ { + wg.Add(1) + go func() { + defer wg.Done() + + testCid, testData := randBlock() + + has, err := wbs.Has(ctx, testCid.KeyString()) + require.NoError(t, err) + require.False(t, has) + + err = wbs.Put(ctx, testCid.KeyString(), testData) + require.NoError(t, err) + + got, err := wbs.Get(ctx, testCid.KeyString()) + require.NoError(t, err) + require.Equal(t, testData, got) + }() + } + wg.Wait() +} + +func TestNullPadding(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() + + paddedV1, err := os.ReadFile("../testdata/sample-v1-with-zero-len-section.car") + require.NoError(t, err) + + readable, err := storage.OpenReadable(bufferReaderAt(paddedV1), carv2.ZeroLengthSectionAsEOF(true)) + require.NoError(t, 
err) + + roots := readable.Roots() + require.Len(t, roots, 1) + has, err := readable.Has(ctx, roots[0].KeyString()) + require.NoError(t, err) + require.True(t, has) + + actual, err := carv2.NewBlockReader(bytes.NewReader(paddedV1), carv2.ZeroLengthSectionAsEOF(true)) + require.NoError(t, err) + + for { + wantBlock, err := actual.Next() + if err == io.EOF { + break + } + require.NoError(t, err) + b, err := readable.Get(ctx, wantBlock.Cid().KeyString()) + require.NoError(t, err) + require.Equal(t, wantBlock.RawData(), b) + } +} + +func TestPutSameHashes(t *testing.T) { + tdir := t.TempDir() + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() + + // This writable allows duplicate puts, and identifies by multihash as per the default. + pathAllowDups := filepath.Join(tdir, "writable-allowdup.car") + dstAllowDups, err := os.OpenFile(pathAllowDups, os.O_CREATE|os.O_RDWR, 0644) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, dstAllowDups.Close()) }) + wbsAllowDups, err := storage.NewReadableWritable(dstAllowDups, nil, carv2.AllowDuplicatePuts(true)) + require.NoError(t, err) + + // This writable deduplicates puts by CID. + pathByCID := filepath.Join(tdir, "writable-dedup-wholecid.car") + dstByCID, err := os.OpenFile(pathByCID, os.O_CREATE|os.O_RDWR, 0644) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, dstByCID.Close()) }) + wbsByCID, err := storage.NewReadableWritable(dstByCID, nil, carv2.UseWholeCIDs(true)) + require.NoError(t, err) + + // This writable deduplicates puts by multihash + pathByHash := filepath.Join(tdir, "writable-dedup-byhash.car") + dstByHash, err := os.OpenFile(pathByHash, os.O_CREATE|os.O_RDWR, 0644) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, dstByHash.Close()) }) + wbsByHash, err := storage.NewReadableWritable(dstByHash, nil) + require.NoError(t, err) + + var blockList []struct { + cid cid.Cid + data []byte + } + + appendBlock := func(data []byte, version, codec uint64) { + c, err := cid.Prefix{ + Version: version, + Codec: codec, + MhType: multihash.SHA2_256, + MhLength: -1, + }.Sum(data) + require.NoError(t, err) + blockList = append(blockList, struct { + cid cid.Cid + data []byte + }{c, data}) + } + + // Two raw blocks, meaning we have two unique multihashes. + // However, we have multiple CIDs for each multihash. + // We also have two duplicate CIDs. + data1 := []byte("foo bar") + appendBlock(data1, 0, cid.DagProtobuf) + appendBlock(data1, 1, cid.DagProtobuf) + appendBlock(data1, 1, cid.DagCBOR) + appendBlock(data1, 1, cid.DagCBOR) // duplicate CID + + data2 := []byte("foo bar baz") + appendBlock(data2, 0, cid.DagProtobuf) + appendBlock(data2, 1, cid.DagProtobuf) + appendBlock(data2, 1, cid.DagProtobuf) // duplicate CID + appendBlock(data2, 1, cid.DagCBOR) + + countBlocks := func(path string) int { + f, err := os.Open(path) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, f.Close()) }) + rdr, err := carv2.NewBlockReader(f) + require.NoError(t, err) + + n := 0 + for { + _, err := rdr.Next() + if err == io.EOF { + break + } + n++ + } + return n + } + + putBlockList := func(writable *storage.StorageCar) { + for i, block := range blockList { + // Has should never error here. + // The first block should be missing. + // Others might not, given the duplicate hashes. 
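+			// (that is, a later block may already be reported present when its
+			// multihash or whole CID matches one put earlier, depending on the
+			// store's de-duplication mode)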
+			has, err := writable.Has(ctx, block.cid.KeyString())
+			require.NoError(t, err)
+			if i == 0 {
+				require.False(t, has)
+			}
+
+			err = writable.Put(ctx, block.cid.KeyString(), block.data)
+			require.NoError(t, err)
+
+			// Has and Get need to work right after a Put
+			has, err = writable.Has(ctx, block.cid.KeyString())
+			require.NoError(t, err)
+			require.True(t, has)
+
+			got, err := writable.Get(ctx, block.cid.KeyString())
+			require.NoError(t, err)
+			require.Equal(t, block.data, got)
+		}
+	}
+
+	putBlockList(wbsAllowDups)
+	err = wbsAllowDups.Finalize()
+	require.NoError(t, err)
+	require.Equal(t, len(blockList), countBlocks(pathAllowDups))
+
+	// Put the same list of blocks to the CAR that deduplicates by CID.
+	// We should end up with two fewer blocks, as two are entire CID duplicates.
+	putBlockList(wbsByCID)
+	err = wbsByCID.Finalize()
+	require.NoError(t, err)
+	require.Equal(t, len(blockList)-2, countBlocks(pathByCID))
+
+	// Put the same list of blocks to the CAR that deduplicates by multihash.
+	// We should end up with just two blocks, as the original set of blocks only
+	// has two distinct multihashes.
+	putBlockList(wbsByHash)
+	err = wbsByHash.Finalize()
+	require.NoError(t, err)
+	require.Equal(t, 2, countBlocks(pathByHash))
+}
+
+func TestReadableCantWrite(t *testing.T) {
+	inp, err := os.Open("../testdata/sample-v1.car")
+	require.NoError(t, err)
+	t.Cleanup(func() { require.NoError(t, inp.Close()) })
+	readable, err := storage.OpenReadable(inp)
+	require.NoError(t, err)
+	require.ErrorContains(t, readable.(*storage.StorageCar).Put(context.Background(), randCid().KeyString(), []byte("bar")), "read-only")
+	// Finalize() is nonsense for a readable, but it should be safe
+	require.NoError(t, readable.(*storage.StorageCar).Finalize())
+}
+
+func TestWritableCantRead(t *testing.T) {
+	// an io.Writer with no io.WriterAt capabilities
+	path := filepath.Join(t.TempDir(), "writable.car")
+	out, err := os.Create(path)
+	require.NoError(t, err)
+	t.Cleanup(func() { require.NoError(t, out.Close()) })
+
+	// This should fail because the writer is not an io.WriterAt
+	_, err = storage.NewWritable(&writerOnly{out}, nil)
+	require.ErrorContains(t, err, "CARv2")
+	require.ErrorContains(t, err, "non-seekable")
+
+	writable, err := storage.NewWritable(&writerOnly{out}, nil, carv2.WriteAsCarV1(true))
+	require.NoError(t, err)
+
+	_, err = writable.(*storage.StorageCar).Get(context.Background(), randCid().KeyString())
+	require.ErrorContains(t, err, "write-only")
+
+	_, err = writable.(*storage.StorageCar).GetStream(context.Background(), randCid().KeyString())
+	require.ErrorContains(t, err, "write-only")
+
+	require.NoError(t, writable.Finalize())
+}
+
+func TestReadWriteWithPaddingWorksAsExpected(t *testing.T) {
+	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+	defer cancel()
+
+	testCid1, testData1 := randBlock()
+	testCid2, testData2 := randBlock()
+
+	wantRoots := []cid.Cid{testCid1, testCid2}
+	path := filepath.Join(t.TempDir(), "readwrite-with-padding.car")
+	writer, err := os.Create(path)
+	require.NoError(t, err)
+	t.Cleanup(func() { require.NoError(t, writer.Close()) })
+
+	wantCarV1Padding := uint64(1413)
+	wantIndexPadding := uint64(1314)
+	subject, err := storage.NewReadableWritable(
+		writer,
+		wantRoots,
+		carv2.UseDataPadding(wantCarV1Padding),
+		carv2.UseIndexPadding(wantIndexPadding))
+	require.NoError(t, err)
+	require.NoError(t, subject.Put(ctx, testCid1.KeyString(), testData1))
+	require.NoError(t, subject.Put(ctx, testCid2.KeyString(), testData2))
+	
require.NoError(t, subject.Finalize()) + + // Assert CARv2 header contains right offsets. + gotCarV2, err := carv2.OpenReader(path) + t.Cleanup(func() { gotCarV2.Close() }) + require.NoError(t, err) + wantCarV1Offset := carv2.PragmaSize + carv2.HeaderSize + wantCarV1Padding + wantIndexOffset := wantCarV1Offset + gotCarV2.Header.DataSize + wantIndexPadding + require.Equal(t, wantCarV1Offset, gotCarV2.Header.DataOffset) + require.Equal(t, wantIndexOffset, gotCarV2.Header.IndexOffset) + require.NoError(t, gotCarV2.Close()) + + f, err := os.Open(path) + require.NoError(t, err) + t.Cleanup(func() { f.Close() }) + + // Assert reading CARv1 directly at offset and size is as expected. + gotCarV1, err := carv1.NewCarReader(io.NewSectionReader(f, int64(wantCarV1Offset), int64(gotCarV2.Header.DataSize))) + require.NoError(t, err) + require.Equal(t, wantRoots, gotCarV1.Header.Roots) + gotBlock, err := gotCarV1.Next() + require.NoError(t, err) + require.Equal(t, testCid1, gotBlock.Cid()) + require.Equal(t, testData1, gotBlock.RawData()) + gotBlock, err = gotCarV1.Next() + require.NoError(t, err) + require.Equal(t, testCid2, gotBlock.Cid()) + require.Equal(t, testData2, gotBlock.RawData()) + + _, err = gotCarV1.Next() + require.Equal(t, io.EOF, err) + + // Assert reading index directly from file is parsable and has expected CIDs. + stat, err := f.Stat() + require.NoError(t, err) + indexSize := stat.Size() - int64(wantIndexOffset) + gotIdx, err := index.ReadFrom(io.NewSectionReader(f, int64(wantIndexOffset), indexSize)) + require.NoError(t, err) + _, err = index.GetFirst(gotIdx, testCid1) + require.NoError(t, err) + _, err = index.GetFirst(gotIdx, testCid2) + require.NoError(t, err) +} + +func TestResumption(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() + + srcPath := "../testdata/sample-v1.car" + + v1f, err := os.Open(srcPath) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, v1f.Close()) }) + rd, err := carv2.NewReader(v1f) + require.NoError(t, err) + roots, err := rd.Roots() + require.NoError(t, err) + + blockSource := func() <-chan simpleBlock { + v1f, err := os.Open(srcPath) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, v1f.Close()) }) + r, err := carv1.NewCarReader(v1f) + require.NoError(t, err) + ret := make(chan simpleBlock) + + go func() { + for { + b, err := r.Next() + if err == io.EOF { + close(ret) + break + } + require.NoError(t, err) + ret <- simpleBlock{cid: b.Cid(), data: b.RawData()} + } + }() + + return ret + } + + path := filepath.Join(t.TempDir(), "readwrite-resume.car") + writer, err := os.Create(path) + require.NoError(t, err) + t.Cleanup(func() { require.NoError(t, writer.Close()) }) + // Create an incomplete CARv2 file with no blocks put. + subject, err := storage.NewReadableWritable(writer, roots, carv2.UseWholeCIDs(true)) + require.NoError(t, err) + + // For each block resume on the same file, putting blocks one at a time. + var wantBlockCountSoFar, idCidCount int + wantBlocks := make(map[cid.Cid]simpleBlock) + for b := range blockSource() { + wantBlockCountSoFar++ + wantBlocks[b.cid] = b + + dmh, err := multihash.Decode(b.cid.Hash()) + require.NoError(t, err) + if dmh.Code == multihash.IDENTITY { + idCidCount++ + } + + // 30% chance of subject failing; more concretely: re-instantiating the StorageCar with the same + // file without calling Finalize. The higher this percentage the slower the test runs + // considering the number of blocks in the original CARv1 test payload. 
+		resume := rng.Float32() <= 0.3
+		// If testing the resume case, then flip a coin to decide whether to finalize before the
+		// StorageCar re-instantiation or not. Note, both cases should work for resumption since
+		// we do not limit resumption to unfinalized files.
+		finalizeBeforeResumption := rng.Float32() <= 0.5
+		if resume {
+			if finalizeBeforeResumption {
+				require.NoError(t, subject.Finalize())
+			}
+
+			_, err := writer.Seek(0, io.SeekStart)
+			require.NoError(t, err)
+			subject, err = storage.OpenReadableWritable(writer, roots, carv2.UseWholeCIDs(true))
+			require.NoError(t, err)
+		}
+		require.NoError(t, subject.Put(ctx, b.cid.KeyString(), b.data))
+
+		// With a 10% chance, test read operations on a resumed read-write StorageCar.
+		// We don't test on every put to reduce test runtime.
+		testRead := rng.Float32() <= 0.1
+		if testRead {
+			// Assert read operations on the read-write StorageCar are as expected when
+			// resumed from an existing file.
+			ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+			t.Cleanup(cancel)
+			for k, wantBlock := range wantBlocks {
+				has, err := subject.Has(ctx, k.KeyString())
+				require.NoError(t, err)
+				require.True(t, has)
+				gotBlock, err := subject.Get(ctx, k.KeyString())
+				require.NoError(t, err)
+				require.Equal(t, wantBlock.data, gotBlock)
+			}
+			// Sanity check: every block put so far had a unique CID.
+			require.Equal(t, wantBlockCountSoFar, len(wantBlocks))
+		}
+	}
+
+	// Finalize the StorageCar to complete the partially written CARv2 file.
+	subject, err = storage.OpenReadableWritable(writer, roots, carv2.UseWholeCIDs(true))
+	require.NoError(t, err)
+	require.NoError(t, subject.Finalize())
+
+	// Assert the resulting file is a valid CARv2 with an index.
+	v2f, err := os.Open(path)
+	require.NoError(t, err)
+	t.Cleanup(func() { require.NoError(t, v2f.Close()) })
+	v2r, err := carv2.NewReader(v2f)
+	require.NoError(t, err)
+	require.True(t, v2r.Header.HasIndex())
+
+	// Assert the CARv1 payload in the file matches the original CARv1 payload.
+	_, err = v1f.Seek(0, io.SeekStart)
+	require.NoError(t, err)
+	wantPayloadReader, err := carv1.NewCarReader(v1f)
+	require.NoError(t, err)
+
+	dr, err := v2r.DataReader()
+	require.NoError(t, err)
+	gotPayloadReader, err := carv1.NewCarReader(dr)
+	require.NoError(t, err)
+
+	require.Equal(t, wantPayloadReader.Header, gotPayloadReader.Header)
+	for {
+		wantNextBlock, wantErr := wantPayloadReader.Next()
+		if wantErr == io.EOF {
+			gotNextBlock, gotErr := gotPayloadReader.Next()
+			require.Equal(t, wantErr, gotErr)
+			require.Nil(t, gotNextBlock)
+			break
+		}
+		require.NoError(t, wantErr)
+
+		dmh, err := multihash.Decode(wantNextBlock.Cid().Hash())
+		require.NoError(t, err)
+		if dmh.Code == multihash.IDENTITY {
+			continue
+		}
+
+		gotNextBlock, gotErr := gotPayloadReader.Next()
+		require.NoError(t, gotErr)
+		require.Equal(t, wantNextBlock, gotNextBlock)
+	}
+
+	// Assert the index in the resumed file is identical to an index generated from
+	// the data payload portion of the written CARv2 file.
+	ir, err := v2r.IndexReader()
+	require.NoError(t, err)
+	gotIdx, err := index.ReadFrom(ir)
+	require.NoError(t, err)
+	dr, err = v2r.DataReader()
+	require.NoError(t, err)
+	wantIdx, err := carv2.GenerateIndex(dr)
+	require.NoError(t, err)
+	require.Equal(t, wantIdx, gotIdx)
+}
+
+func TestResumptionV1(t *testing.T) {
+	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+	defer cancel()
+
+	srcPath := "../testdata/sample-v1.car"
+
+	v1f, err := os.Open(srcPath)
+	require.NoError(t, err)
+	t.Cleanup(func() { require.NoError(t, v1f.Close()) })
+	rd, err := carv2.NewReader(v1f)
+	require.NoError(t, err)
+	roots, err := rd.Roots()
+	require.NoError(t, err)
+
+	blockSource := func() <-chan simpleBlock {
+		v1f, err := os.Open(srcPath)
+		require.NoError(t, err)
+		t.Cleanup(func() { require.NoError(t, v1f.Close()) })
+		r, err := carv1.NewCarReader(v1f)
+		require.NoError(t, err)
+		ret := make(chan simpleBlock)
+
+		go func() {
+			for {
+				b, err := r.Next()
+				if err == io.EOF {
+					close(ret)
+					break
+				}
+				require.NoError(t, err)
+				ret <- simpleBlock{cid: b.Cid(), data: b.RawData()}
+			}
+		}()
+
+		return ret
+	}
+
+	path := filepath.Join(t.TempDir(), "readwrite-resume-v1.car")
+	writer, err := os.Create(path)
+	require.NoError(t, err)
+	t.Cleanup(func() { require.NoError(t, writer.Close()) })
+	// Create an incomplete CARv1 file with no blocks put (WriteAsCarV1 is set).
+	subject, err := storage.NewReadableWritable(writer, roots, carv2.UseWholeCIDs(true), carv2.WriteAsCarV1(true))
+	require.NoError(t, err)
+
+	// For each block resume on the same file, putting blocks one at a time.
+	var wantBlockCountSoFar, idCidCount int
+	wantBlocks := make(map[cid.Cid]simpleBlock)
+	for b := range blockSource() {
+		wantBlockCountSoFar++
+		wantBlocks[b.cid] = b
+
+		dmh, err := multihash.Decode(b.cid.Hash())
+		require.NoError(t, err)
+		if dmh.Code == multihash.IDENTITY {
+			idCidCount++
+		}
+
+		// 30% chance of subject failing; more concretely: re-instantiating the StorageCar with the same
+		// file without calling Finalize. The higher this percentage the slower the test runs
+		// considering the number of blocks in the original CARv1 test payload.
+		resume := rng.Float32() <= 0.3
+		// If testing the resume case, then flip a coin to decide whether to finalize before the
+		// StorageCar re-instantiation or not. Note, both cases should work for resumption since
+		// we do not limit resumption to unfinalized files.
+		finalizeBeforeResumption := rng.Float32() <= 0.5
+		if resume {
+			if finalizeBeforeResumption {
+				require.NoError(t, subject.Finalize())
+			}
+
+			_, err := writer.Seek(0, io.SeekStart)
+			require.NoError(t, err)
+			subject, err = storage.OpenReadableWritable(writer, roots, carv2.UseWholeCIDs(true), carv2.WriteAsCarV1(true))
+			require.NoError(t, err)
+		}
+		require.NoError(t, subject.Put(ctx, b.cid.KeyString(), b.data))
+
+		// With a 10% chance, test read operations on a resumed read-write StorageCar.
+		// We don't test on every put to reduce test runtime.
+		testRead := rng.Float32() <= 0.1
+		if testRead {
+			// Assert read operations on the read-write StorageCar are as expected when
+			// resumed from an existing file.
+			ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+			t.Cleanup(cancel)
+			for k, wantBlock := range wantBlocks {
+				has, err := subject.Has(ctx, k.KeyString())
+				require.NoError(t, err)
+				require.True(t, has)
+				gotBlock, err := subject.Get(ctx, k.KeyString())
+				require.NoError(t, err)
+				require.Equal(t, wantBlock.data, gotBlock)
+			}
+			// Sanity check: every block put so far had a unique CID.
+			require.Equal(t, wantBlockCountSoFar, len(wantBlocks))
+		}
+	}
+
+	// Finalize the StorageCar; in WriteAsCarV1 mode there is no CARv2 header or
+	// index to write, so this is effectively a no-op but must be safe to call.
+	subject, err = storage.OpenReadableWritable(writer, roots, carv2.UseWholeCIDs(true), carv2.WriteAsCarV1(true))
+	require.NoError(t, err)
+	require.NoError(t, subject.Finalize())
+
+	// Assert the resulting file is a plain CARv1 with no CARv2 index.
+	v2f, err := os.Open(path)
+	require.NoError(t, err)
+	t.Cleanup(func() { require.NoError(t, v2f.Close()) })
+	v2r, err := carv2.NewReader(v2f)
+	require.NoError(t, err)
+	require.False(t, v2r.Header.HasIndex())
+	require.Equal(t, uint64(1), v2r.Version)
+
+	_, err = v1f.Seek(0, io.SeekStart)
+	require.NoError(t, err)
+	wantPayloadReader, err := carv1.NewCarReader(v1f)
+	require.NoError(t, err)
+
+	dr, err := v2r.DataReader() // since this is a v1, this simply reads from the top of the file
+	require.NoError(t, err)
+	gotPayloadReader, err := carv1.NewCarReader(dr)
+	require.NoError(t, err)
+
+	require.Equal(t, wantPayloadReader.Header, gotPayloadReader.Header)
+	for {
+		wantNextBlock, wantErr := wantPayloadReader.Next()
+		if wantErr == io.EOF {
+			gotNextBlock, gotErr := gotPayloadReader.Next()
+			require.Equal(t, wantErr, gotErr)
+			require.Nil(t, gotNextBlock)
+			break
+		}
+		require.NoError(t, wantErr)
+
+		dmh, err := multihash.Decode(wantNextBlock.Cid().Hash())
+		require.NoError(t, err)
+		if dmh.Code == multihash.IDENTITY {
+			continue
+		}
+
+		gotNextBlock, gotErr := gotPayloadReader.Next()
+		require.NoError(t, gotErr)
+		require.Equal(t, wantNextBlock, gotNextBlock)
+	}
+}
+
+func TestResumptionIsSupportedOnFinalizedFile(t *testing.T) {
+	path := filepath.Join(t.TempDir(), "readwrite-resume-finalized.car")
+	v2f, err := os.Create(path)
+	require.NoError(t, err)
+	t.Cleanup(func() { require.NoError(t, v2f.Close()) })
+	// Create an incomplete CARv2 file with no blocks put.
+	subject, err := storage.NewReadableWritable(v2f, []cid.Cid{})
+	require.NoError(t, err)
+	require.NoError(t, subject.Finalize())
+
+	reopen, err := os.OpenFile(path, os.O_RDWR, 0o666)
+	require.NoError(t, err)
+	t.Cleanup(func() { require.NoError(t, reopen.Close()) })
+	subject, err = storage.NewReadableWritable(reopen, []cid.Cid{})
+	require.NoError(t, err)
+	t.Cleanup(func() { subject.Finalize() })
+}
+
+func TestReadWriteErrorsOnlyWhenFinalized(t *testing.T) {
+	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+	defer cancel()
+
+	testCid1, testData1 := randBlock()
+	testCid2, testData2 := randBlock()
+
+	wantRoots := []cid.Cid{testCid1, testCid2}
+	path := filepath.Join(t.TempDir(), "readwrite-finalized-panic.car")
+	writer, err := os.Create(path)
+	require.NoError(t, err)
+	t.Cleanup(func() { require.NoError(t, writer.Close()) })
+
+	subject, err := storage.NewReadableWritable(writer, wantRoots)
+	require.NoError(t, err)
+
+	require.NoError(t, subject.Put(ctx, testCid1.KeyString(), testData1))
+	require.NoError(t, subject.Put(ctx, testCid2.KeyString(), testData2))
+
+	gotBlock, err := subject.Get(ctx, testCid1.KeyString())
+	require.NoError(t, err)
+	require.Equal(t, testData1, gotBlock)
+
+	gotRoots := subject.Roots()
+	require.Equal(t, wantRoots, gotRoots)
+
+	has, err := subject.Has(ctx, testCid1.KeyString())
+	require.NoError(t, err)
+	require.True(t, has)
+
+	require.NoError(t, subject.Finalize())
+	require.Error(t, subject.Finalize())
+
+	_, ok := (interface{})(subject).(io.Closer)
+	require.False(t, ok)
+
+	_, err = subject.Get(ctx, testCid1.KeyString())
+	require.Error(t, err)
+	_, err = subject.Has(ctx, testCid2.KeyString())
+	require.Error(t, err)
+
+	require.Error(t, subject.Put(ctx, testCid1.KeyString(), testData1))
+}
+
+func TestReadWriteResumptionMismatchingRootsIsError(t *testing.T) {
+	tmpPath := requireTmpCopy(t, "../testdata/sample-wrapped-v2.car")
+
+	origContent, err := os.ReadFile(tmpPath)
+	require.NoError(t, err)
+
+	badRoot := randCid()
+	writer, err := os.OpenFile(tmpPath, os.O_RDWR, 0o666)
+	require.NoError(t, err)
+	t.Cleanup(func() { writer.Close() })
+	subject, err := storage.OpenReadableWritable(writer, []cid.Cid{badRoot})
+	require.EqualError(t, err, "cannot resume on file with mismatching data header")
+	require.Nil(t, subject)
+	require.NoError(t, writer.Close())
+
+	newContent, err := os.ReadFile(tmpPath)
+	require.NoError(t, err)
+
+	// Expect the bad file to be left untouched; check the size first.
+	// If the sizes mismatch, printing a huge diff would not help us.
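+	// A failed resume must behave as if the file was never opened for writing: the
+	// header, payload and index bytes must all remain byte-for-byte identical.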
+	require.Equal(t, len(origContent), len(newContent))
+	require.Equal(t, origContent, newContent)
+}
+
+func requireTmpCopy(t *testing.T, src string) string {
+	srcF, err := os.Open(src)
+	require.NoError(t, err)
+	defer func() { require.NoError(t, srcF.Close()) }()
+	stats, err := srcF.Stat()
+	require.NoError(t, err)
+
+	dst := filepath.Join(t.TempDir(), stats.Name())
+	dstF, err := os.Create(dst)
+	require.NoError(t, err)
+	defer func() { require.NoError(t, dstF.Close()) }()
+
+	_, err = io.Copy(dstF, srcF)
+	require.NoError(t, err)
+	return dst
+}
+
+func TestReadWriteResumptionFromFileWithDifferentCarV1PaddingIsError(t *testing.T) {
+	testCid1, testData1 := randBlock()
+	wantRoots := []cid.Cid{testCid1}
+	path := filepath.Join(t.TempDir(), "readwrite-resume-with-padding.car")
+	writer, err := os.Create(path)
+	require.NoError(t, err)
+	t.Cleanup(func() { require.NoError(t, writer.Close()) })
+	subject, err := storage.NewReadableWritable(
+		writer,
+		wantRoots,
+		carv2.UseDataPadding(1413))
+	require.NoError(t, err)
+	require.NoError(t, subject.Put(context.TODO(), testCid1.KeyString(), testData1))
+	require.NoError(t, subject.Finalize())
+
+	subject, err = storage.OpenReadableWritable(
+		writer,
+		wantRoots,
+		carv2.UseDataPadding(1314))
+	require.EqualError(t, err, "cannot resume from file with mismatched CARv1 offset; "+
+		"`WithDataPadding` option must match the padding on file. "+
+		"Expected padding value of 1413 but got 1314")
+	require.Nil(t, subject)
+}
+
+func TestOperationsErrorWithBadCidStrings(t *testing.T) {
+	testCid, testData := randBlock()
+	path := filepath.Join(t.TempDir(), "badkeys.car")
+	writer, err := os.Create(path)
+	require.NoError(t, err)
+	t.Cleanup(func() { require.NoError(t, writer.Close()) })
+	subject, err := storage.NewReadableWritable(writer, []cid.Cid{})
+	require.NoError(t, err)
+
+	require.NoError(t, subject.Put(context.TODO(), testCid.KeyString(), testData))
+	require.ErrorContains(t, subject.Put(context.TODO(), fmt.Sprintf("%s/nope", testCid.KeyString()), testData), "bad CID key")
+	require.ErrorContains(t, subject.Put(context.TODO(), "nope", testData), "bad CID key")
+
+	has, err := subject.Has(context.TODO(), testCid.KeyString())
+	require.NoError(t, err)
+	require.True(t, has)
+	has, err = subject.Has(context.TODO(), fmt.Sprintf("%s/nope", testCid.KeyString()))
+	require.ErrorContains(t, err, "bad CID key")
+	require.False(t, has)
+	has, err = subject.Has(context.TODO(), "nope")
+	require.ErrorContains(t, err, "bad CID key")
+	require.False(t, has)
+
+	got, err := subject.Get(context.TODO(), testCid.KeyString())
+	require.NoError(t, err)
+	require.NotNil(t, got)
+	got, err = subject.Get(context.TODO(), fmt.Sprintf("%s/nope", testCid.KeyString()))
+	require.ErrorContains(t, err, "bad CID key")
+	require.Nil(t, got)
+	got, err = subject.Get(context.TODO(), "nope")
+	require.ErrorContains(t, err, "bad CID key")
+	require.Nil(t, got)
+}
+
+func TestWholeCID(t *testing.T) {
+	for _, whole := range []bool{true, false} {
+		whole := whole
+		t.Run(fmt.Sprintf("whole=%t", whole), func(t *testing.T) {
+			t.Parallel()
+			ctx := context.Background()
+			path := filepath.Join(t.TempDir(), fmt.Sprintf("writable_%t.car", whole))
+			out, err := os.Create(path)
+			require.NoError(t, err)
+			t.Cleanup(func() { require.NoError(t, out.Close()) })
+			store, err := storage.NewReadableWritable(out, []cid.Cid{}, carv2.UseWholeCIDs(whole))
+			require.NoError(t, err)
+
+			c1, b1 := randBlock()
+			c2, b2 := randBlock()
+
+			require.NoError(t, store.Put(ctx, c1.KeyString(), b1))
+			has, err := store.Has(ctx, c1.KeyString())
+			require.NoError(t, err)
+			require.True(t, has)
+
+			pref := c1.Prefix()
+			pref.Codec = cid.DagProtobuf
+			pref.Version = 1
+			cpb1, err := pref.Sum(b1)
+			require.NoError(t, err)
+
+			has, err = store.Has(ctx, cpb1.KeyString())
+			require.NoError(t, err)
+			require.Equal(t, has, !whole)
+
+			require.NoError(t, store.Put(ctx, c2.KeyString(), b1))
+			has, err = store.Has(ctx, c2.KeyString())
+			require.NoError(t, err)
+			require.True(t, has)
+			has, err = store.Has(ctx, cpb1.KeyString())
+			require.NoError(t, err)
+			require.Equal(t, has, !whole)
+
+			pref = c2.Prefix()
+			pref.Codec = cid.DagProtobuf
+			pref.Version = 1
+			cpb2, err := pref.Sum(b2)
+			require.NoError(t, err)
+
+			has, err = store.Has(ctx, cpb2.KeyString())
+			require.NoError(t, err)
+			require.Equal(t, has, !whole)
+			has, err = store.Has(ctx, cpb1.KeyString())
+			require.NoError(t, err)
+			require.Equal(t, has, !whole)
+		})
+	}
+}
+
+type writerOnly struct {
+	io.Writer
+}
+
+func (w *writerOnly) Write(p []byte) (n int, err error) {
+	return w.Writer.Write(p)
+}
+
+type writerAtOnly struct {
+	*os.File
+}
+
+func (w *writerAtOnly) WriteAt(p []byte, off int64) (n int, err error) {
+	return w.File.WriteAt(p, off)
+}
+
+func (w *writerAtOnly) Write(p []byte) (n int, err error) {
+	return w.File.Write(p)
+}
+
+func randBlock() (cid.Cid, []byte) {
+	data := make([]byte, 1024)
+	rngLk.Lock()
+	rng.Read(data)
+	rngLk.Unlock()
+	h, err := multihash.Sum(data, multihash.SHA2_512, -1)
+	if err != nil {
+		panic(err)
+	}
+	return cid.NewCidV1(cid.Raw, h), data
+}
+
+func randCid() cid.Cid {
+	b := make([]byte, 32)
+	rngLk.Lock()
+	rng.Read(b)
+	rngLk.Unlock()
+	mh, _ := multihash.Encode(b, multihash.SHA2_256)
+	return cid.NewCidV1(cid.DagProtobuf, mh)
+}
+
+func randIdentityCid() cid.Cid {
+	b := make([]byte, 32)
+	rngLk.Lock()
+	rng.Read(b)
+	rngLk.Unlock()
+	mh, _ := multihash.Encode(b, multihash.IDENTITY)
+	return cid.NewCidV1(cid.Raw, mh)
+}
+
+type bufferReaderAt []byte
+
+func (b bufferReaderAt) ReadAt(p []byte, off int64) (int, error) {
+	if off >= int64(len(b)) {
+		return 0, io.EOF
+	}
+	return copy(p, b[off:]), nil
+}
+
+type simpleBlock struct {
+	cid  cid.Cid
+	data []byte
+}
diff --git a/ipld/car/v2/testdata/fuzz/FuzzBlockReader/c3c7eedeb4968a5b3131371ba9f5138a38c882a7fa06456595998e740a9f5a14 b/ipld/car/v2/testdata/fuzz/FuzzBlockReader/c3c7eedeb4968a5b3131371ba9f5138a38c882a7fa06456595998e740a9f5a14
new file mode 100644
index 0000000000..ae9262d491
--- /dev/null
+++ b/ipld/car/v2/testdata/fuzz/FuzzBlockReader/c3c7eedeb4968a5b3131371ba9f5138a38c882a7fa06456595998e740a9f5a14
@@ -0,0 +1,2 @@
+go test fuzz v1
+[]byte("\xff\x80\xaa\x95\xa6sion\x01,\x01q\x12 \x15\x1f\xe9\xe7 dataSize {
+		// Truncate to the expected size to ensure the resulting file is a correctly sized CARv1.
+		err = dst.Truncate(written)
+	}
+
+	return err
+}
+
+// AttachIndex attaches a given index to an existing CARv2 file at the given path and offset.
+func AttachIndex(path string, idx index.Index, offset uint64) error {
+	// TODO: instead of offset, maybe take padding?
+	// TODO: check that the given path is indeed a CARv2.
+	// TODO: update CARv2 header according to the offset at which index is written out.
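+	// The index bytes are written starting at the given offset via an offset writer;
+	// everything before that offset in the file is left untouched.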
+	out, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0o640)
+	if err != nil {
+		return err
+	}
+	defer out.Close()
+	indexWriter := internalio.NewOffsetWriter(out, int64(offset))
+	_, err = index.WriteTo(idx, indexWriter)
+	return err
+}
+
+// ReplaceRootsInFile replaces the root CIDs in the CAR file at the given path with the given
+// roots. This function accepts both CARv1 and CARv2 files.
+//
+// Note that the roots are only replaced if their total serialized size exactly matches the total
+// serialized size of the existing roots in the CAR file.
+func ReplaceRootsInFile(path string, roots []cid.Cid, opts ...Option) (err error) {
+	f, err := os.OpenFile(path, os.O_RDWR, 0o666)
+	if err != nil {
+		return err
+	}
+	defer func() {
+		// Close the file, and if the returned error is nil, replace it with any error from Close.
+		if cerr := f.Close(); err == nil {
+			err = cerr
+		}
+	}()
+
+	options := ApplyOptions(opts...)
+
+	// Read the header or pragma; note that both are valid CARv1 headers.
+	header, err := carv1.ReadHeader(f, options.MaxAllowedHeaderSize)
+	if err != nil {
+		return err
+	}
+
+	var currentSize int64
+	var newHeaderOffset int64
+	switch header.Version {
+	case 1:
+		// When the given file is a CARv1:
+		// 1. The offset at which the new header should be written is zero (newHeaderOffset = 0).
+		// 2. The current header size is equal to the number of bytes read so far.
+		//
+		// Note that we explicitly avoid using carv1.HeaderSize to determine the current header
+		// size: carv1.ReadHeader does not read any extra bytes, so simply counting the bytes
+		// read so far avoids the extra allocations carv1.HeaderSize would incur.
+		currentSize, err = f.Seek(0, io.SeekCurrent)
+		if err != nil {
+			return err
+		}
+	case 2:
+		// When the given file is a CARv2:
+		// 1. The offset at which the new header should be written is carv2.Header.DataOffset.
+		// 2. The inner CARv1 header size is equal to the number of bytes read minus
+		//    carv2.Header.DataOffset.
+		var v2h Header
+		if _, err = v2h.ReadFrom(f); err != nil {
+			return err
+		}
+		newHeaderOffset = int64(v2h.DataOffset)
+		if _, err = f.Seek(newHeaderOffset, io.SeekStart); err != nil {
+			return err
+		}
+		var innerV1Header *carv1.CarHeader
+		innerV1Header, err = carv1.ReadHeader(f, options.MaxAllowedHeaderSize)
+		if err != nil {
+			return err
+		}
+		if innerV1Header.Version != 1 {
+			return fmt.Errorf("invalid data payload header: expected version 1, got %d", innerV1Header.Version)
+		}
+		var readSoFar int64
+		readSoFar, err = f.Seek(0, io.SeekCurrent)
+		if err != nil {
+			return err
+		}
+		currentSize = readSoFar - newHeaderOffset
+	default:
+		return fmt.Errorf("invalid car version: %d", header.Version)
+	}
+
+	newHeader := &carv1.CarHeader{
+		Roots:   roots,
+		Version: 1,
+	}
+	// Serialize the new header directly instead of going through carv1.HeaderSize, which
+	// serializes the header anyway just to calculate its size. Serializing once here yields
+	// both the replacement bytes and their size; otherwise the new header would be serialized
+	// twice: once through carv1.HeaderSize and once to write it out.
+	var buf bytes.Buffer
+	if err = carv1.WriteHeader(newHeader, &buf); err != nil {
+		return err
+	}
+	// Assert the header sizes are consistent.
+	newSize := int64(buf.Len())
+	if currentSize != newSize {
+		return fmt.Errorf("current header size (%d) must match replacement header size (%d)", currentSize, newSize)
+	}
+	// Seek to the offset at which the new header should be written.
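+	// Since the replacement header was just asserted to be exactly the same size as the
+	// current one, this write overlays the old header in place without shifting the
+	// payload that follows it.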
+	if _, err = f.Seek(newHeaderOffset, io.SeekStart); err != nil {
+		return err
+	}
+	_, err = f.Write(buf.Bytes())
+	return err
+}
diff --git a/ipld/car/v2/writer_test.go b/ipld/car/v2/writer_test.go
new file mode 100644
index 0000000000..a38ef34ade
--- /dev/null
+++ b/ipld/car/v2/writer_test.go
@@ -0,0 +1,282 @@
+package car
+
+import (
+	"context"
+	"io"
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/ipld/go-car/v2/index"
+	"github.com/ipld/go-car/v2/internal/carv1"
+	"github.com/stretchr/testify/require"
+
+	"github.com/ipfs/go-cid"
+	format "github.com/ipfs/go-ipld-format"
+	"github.com/ipfs/go-merkledag"
+	dstest "github.com/ipfs/go-merkledag/test"
+	"github.com/stretchr/testify/assert"
+)
+
+func TestWrapV1(t *testing.T) {
+	// Produce a CARv1 file to test wrapping with.
+	dagSvc := dstest.Mock()
+	src := filepath.Join(t.TempDir(), "unwrapped-test-v1.car")
+	sf, err := os.Create(src)
+	require.NoError(t, err)
+	t.Cleanup(func() { require.NoError(t, sf.Close()) })
+	require.NoError(t, carv1.WriteCar(context.Background(), dagSvc, generateRootCid(t, dagSvc), sf))
+
+	// Wrap the test CARv1 file.
+	dest := filepath.Join(t.TempDir(), "wrapped-test-v1.car")
+	df, err := os.Create(dest)
+	require.NoError(t, err)
+	t.Cleanup(func() { require.NoError(t, df.Close()) })
+	_, err = sf.Seek(0, io.SeekStart)
+	require.NoError(t, err)
+	require.NoError(t, WrapV1(sf, df))
+
+	// Assert the wrapped file is a valid CARv2 whose CARv1 data payload matches the
+	// original CARv1 file.
+	subject, err := OpenReader(dest)
+	require.NoError(t, err)
+	t.Cleanup(func() { require.NoError(t, subject.Close()) })
+
+	// Assert CARv1 data payloads are identical.
+	_, err = sf.Seek(0, io.SeekStart)
+	require.NoError(t, err)
+	wantPayload, err := io.ReadAll(sf)
+	require.NoError(t, err)
+	dr, err := subject.DataReader()
+	require.NoError(t, err)
+	gotPayload, err := io.ReadAll(dr)
+	require.NoError(t, err)
+	require.Equal(t, wantPayload, gotPayload)
+
+	// Assert the index embedded in the CARv2 is the same as the index generated from the
+	// original CARv1.
+	wantIdx, err := GenerateIndexFromFile(src)
+	require.NoError(t, err)
+	ir, err := subject.IndexReader()
+	require.NoError(t, err)
+	gotIdx, err := index.ReadFrom(ir)
+	require.NoError(t, err)
+	require.Equal(t, wantIdx, gotIdx)
+}
+
+func TestExtractV1(t *testing.T) {
+	// Produce a CARv1 file to test.
+	dagSvc := dstest.Mock()
+	v1Src := filepath.Join(t.TempDir(), "original-test-v1.car")
+	v1f, err := os.Create(v1Src)
+	require.NoError(t, err)
+	t.Cleanup(func() { require.NoError(t, v1f.Close()) })
+	require.NoError(t, carv1.WriteCar(context.Background(), dagSvc, generateRootCid(t, dagSvc), v1f))
+	_, err = v1f.Seek(0, io.SeekStart)
+	require.NoError(t, err)
+	wantV1, err := io.ReadAll(v1f)
+	require.NoError(t, err)
+
+	// Wrap the produced CARv1 into a CARv2 to use for testing.
+	v2path := filepath.Join(t.TempDir(), "wrapped-for-extract-test-v2.car")
+	require.NoError(t, WrapV1File(v1Src, v2path))
+
+	// Assert extraction from the CARv2 file is as expected.
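+	// ExtractV1File unwraps the inner CARv1 payload; the extracted bytes should match
+	// the original CARv1 byte for byte, both when extracting to a separate destination
+	// and, further below, when extracting in place.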
+	dstPath := filepath.Join(t.TempDir(), "extract-file-test-v1.car")
+	require.NoError(t, ExtractV1File(v2path, dstPath))
+	gotFromFile, err := os.ReadFile(dstPath)
+	require.NoError(t, err)
+	require.Equal(t, wantV1, gotFromFile)
+
+	// Assert in-place extraction from the CARv2 file is as expected.
+	require.NoError(t, ExtractV1File(v2path, v2path))
+	gotFromInPlaceFile, err := os.ReadFile(v2path)
+	require.NoError(t, err)
+	require.Equal(t, wantV1, gotFromInPlaceFile)
+}
+
+func TestExtractV1WithUnknownVersionIsError(t *testing.T) {
+	dstPath := filepath.Join(t.TempDir(), "extract-dst-file-test-v42.car")
+	err := ExtractV1File("testdata/sample-rootless-v42.car", dstPath)
+	require.EqualError(t, err, "source version must be 2; got: 42")
+}
+
+func TestExtractV1FromACarV1IsError(t *testing.T) {
+	dstPath := filepath.Join(t.TempDir(), "extract-dst-file-test-v1.car")
+	err := ExtractV1File("testdata/sample-v1.car", dstPath)
+	require.Equal(t, ErrAlreadyV1, err)
+}
+
+func generateRootCid(t *testing.T, adder format.NodeAdder) []cid.Cid {
+	// TODO: convert this into a utility testing lib that takes an rng and generates a random DAG with some threshold for depth/breadth.
+	this := merkledag.NewRawNode([]byte("fish"))
+	that := merkledag.NewRawNode([]byte("lobster"))
+	other := merkledag.NewRawNode([]byte("🌊"))
+
+	one := &merkledag.ProtoNode{}
+	assertAddNodeLink(t, one, this, "fishmonger")
+
+	another := &merkledag.ProtoNode{}
+	assertAddNodeLink(t, another, one, "barreleye")
+	assertAddNodeLink(t, another, that, "🐡")
+
+	andAnother := &merkledag.ProtoNode{}
+	assertAddNodeLink(t, andAnother, another, "🍤")
+
+	assertAddNodes(t, adder, this, that, other, one, another, andAnother)
+	return []cid.Cid{andAnother.Cid()}
+}
+
+func assertAddNodeLink(t *testing.T, pn *merkledag.ProtoNode, fn format.Node, name string) {
+	assert.NoError(t, pn.AddNodeLink(name, fn))
+}
+
+func assertAddNodes(t *testing.T, adder format.NodeAdder, nds ...format.Node) {
+	for _, nd := range nds {
+		assert.NoError(t, adder.Add(context.Background(), nd))
+	}
+}
+
+func TestReplaceRootsInFile(t *testing.T) {
+	tests := []struct {
+		name       string
+		path       string
+		roots      []cid.Cid
+		wantErrMsg string
+	}{
+		{
+			name:       "CorruptPragmaIsRejected",
+			path:       "testdata/sample-corrupt-pragma.car",
+			wantErrMsg: "unexpected EOF",
+		},
+		{
+			name:       "CARv42IsRejected",
+			path:       "testdata/sample-rootless-v42.car",
+			wantErrMsg: "invalid car version: 42",
+		},
+		{
+			name:       "CARv1RootsOfDifferentSizeAreNotReplaced",
+			path:       "testdata/sample-v1.car",
+			wantErrMsg: "current header size (61) must match replacement header size (18)",
+		},
+		{
+			name:       "CARv2RootsOfDifferentSizeAreNotReplaced",
+			path:       "testdata/sample-wrapped-v2.car",
+			wantErrMsg: "current header size (61) must match replacement header size (18)",
+		},
+		{
+			name:       "CARv1NonEmptyRootsOfDifferentSizeAreNotReplaced",
+			path:       "testdata/sample-v1.car",
+			roots:      []cid.Cid{requireDecodedCid(t, "QmdfTbBqBPQ7VNxZEYEj14VmRuZBkqFbiwReogJgS1zR1n")},
+			wantErrMsg: "current header size (61) must match replacement header size (57)",
+		},
+		{
+			name:       "CARv1ZeroLenNonEmptyRootsOfDifferentSizeAreNotReplaced",
+			path:       "testdata/sample-v1-with-zero-len-section.car",
+			roots:      []cid.Cid{merkledag.NewRawNode([]byte("fish")).Cid()},
+			wantErrMsg: "current header size (61) must match replacement header size (59)",
+		},
+		{
+			name:       "CARv2NonEmptyRootsOfDifferentSizeAreNotReplaced",
+			path:       "testdata/sample-wrapped-v2.car",
+			roots:      []cid.Cid{merkledag.NewRawNode([]byte("fish")).Cid()},
+			wantErrMsg: "current header size (61) must match replacement header size (59)",
+		},
+		{
+			name:       "CARv2IndexlessNonEmptyRootsOfDifferentSizeAreNotReplaced",
+			path:       "testdata/sample-v2-indexless.car",
+			roots:      []cid.Cid{merkledag.NewRawNode([]byte("fish")).Cid()},
+			wantErrMsg: "current header size (61) must match replacement header size (59)",
+		},
+		{
+			name:  "CARv1SameSizeRootsAreReplaced",
+			path:  "testdata/sample-v1.car",
+			roots: []cid.Cid{requireDecodedCid(t, "bafy2bzaced4ueelaegfs5fqu4tzsh6ywbbpfk3cxppupmxfdhbpbhzawfw5od")},
+		},
+		{
+			name:  "CARv2SameSizeRootsAreReplaced",
+			path:  "testdata/sample-wrapped-v2.car",
+			roots: []cid.Cid{requireDecodedCid(t, "bafy2bzaced4ueelaegfs5fqu4tzsh6ywbbpfk3cxppupmxfdhbpbhzawfw5oi")},
+		},
+		{
+			name:  "CARv2IndexlessSameSizeRootsAreReplaced",
+			path:  "testdata/sample-v2-indexless.car",
+			roots: []cid.Cid{requireDecodedCid(t, "bafy2bzaced4ueelaegfs5fqu4tzsh6ywbbpfk3cxppupmxfdhbpbhzawfw5oi")},
+		},
+		{
+			name:  "CARv1ZeroLenSameSizeRootsAreReplaced",
+			path:  "testdata/sample-v1-with-zero-len-section.car",
+			roots: []cid.Cid{requireDecodedCid(t, "bafy2bzaced4ueelaegfs5fqu4tzsh6ywbbpfk3cxppupmxfdhbpbhzawfw5o5")},
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			// Make a copy of the input file to preserve the original for comparison.
+			// This also avoids modifying files in testdata.
+			tmpCopy := requireTmpCopy(t, tt.path)
+			err := ReplaceRootsInFile(tmpCopy, tt.roots)
+			if tt.wantErrMsg != "" {
+				require.EqualError(t, err, tt.wantErrMsg)
+				return
+			}
+			require.NoError(t, err)
+
+			original, err := os.Open(tt.path)
+			require.NoError(t, err)
+			defer func() { require.NoError(t, original.Close()) }()
+
+			target, err := os.Open(tmpCopy)
+			require.NoError(t, err)
+			defer func() { require.NoError(t, target.Close()) }()
+
+			// Assert the file size has not changed.
+			wantStat, err := original.Stat()
+			require.NoError(t, err)
+			gotStat, err := target.Stat()
+			require.NoError(t, err)
+			require.Equal(t, wantStat.Size(), gotStat.Size())
+
+			wantReader, err := NewBlockReader(original, ZeroLengthSectionAsEOF(true))
+			require.NoError(t, err)
+			gotReader, err := NewBlockReader(target, ZeroLengthSectionAsEOF(true))
+			require.NoError(t, err)
+
+			// Assert the roots are replaced.
+			require.Equal(t, tt.roots, gotReader.Roots)
+
+			// Assert data blocks are identical.
+			for {
+				wantNext, wantErr := wantReader.Next()
+				gotNext, gotErr := gotReader.Next()
+				if wantErr == io.EOF {
+					require.Equal(t, io.EOF, gotErr)
+					break
+				}
+				require.NoError(t, wantErr)
+				require.NoError(t, gotErr)
+				require.Equal(t, wantNext, gotNext)
+			}
+		})
+	}
+}
+
+func requireDecodedCid(t *testing.T, s string) cid.Cid {
+	decoded, err := cid.Decode(s)
+	require.NoError(t, err)
+	return decoded
+}
+
+func requireTmpCopy(t *testing.T, src string) string {
+	srcF, err := os.Open(src)
+	require.NoError(t, err)
+	defer func() { require.NoError(t, srcF.Close()) }()
+	stats, err := srcF.Stat()
+	require.NoError(t, err)
+
+	dst := filepath.Join(t.TempDir(), stats.Name())
+	dstF, err := os.Create(dst)
+	require.NoError(t, err)
+	defer func() { require.NoError(t, dstF.Close()) }()
+
+	_, err = io.Copy(dstF, srcF)
+	require.NoError(t, err)
+	return dst
+}