Skip to content

Commit

Permalink
feat: 3626 add option enable license content; disable by default (#3631)
Browse files Browse the repository at this point in the history
---------
Signed-off-by: Christopher Phillips <[email protected]>
  • Loading branch information
spiffcs authored Feb 5, 2025
1 parent 7bab6e9 commit e584c9f
Show file tree
Hide file tree
Showing 25 changed files with 418 additions and 80 deletions.
10 changes: 10 additions & 0 deletions cmd/syft/internal/options/catalog.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ type Catalog struct {
DefaultCatalogers []string `yaml:"default-catalogers" json:"default-catalogers" mapstructure:"default-catalogers"`
SelectCatalogers []string `yaml:"select-catalogers" json:"select-catalogers" mapstructure:"select-catalogers"`
Package packageConfig `yaml:"package" json:"package" mapstructure:"package"`
License licenseConfig `yaml:"license" json:"license" mapstructure:"license"`
File fileConfig `yaml:"file" json:"file" mapstructure:"file"`
Scope string `yaml:"scope" json:"scope" mapstructure:"scope"`
Parallelism int `yaml:"parallelism" json:"parallelism" mapstructure:"parallelism"` // the number of catalog workers to run in parallel
Expand Down Expand Up @@ -69,6 +70,7 @@ func DefaultCatalog() Catalog {
Compliance: defaultComplianceConfig(),
Scope: source.SquashedScope.String(),
Package: defaultPackageConfig(),
License: defaultLicenseConfig(),
LinuxKernel: defaultLinuxKernelConfig(),
Golang: defaultGolangConfig(),
Java: defaultJavaConfig(),
Expand All @@ -89,6 +91,7 @@ func (cfg Catalog) ToSBOMConfig(id clio.Identification) *syft.CreateSBOMConfig {
WithUnknownsConfig(cfg.ToUnknownsConfig()).
WithSearchConfig(cfg.ToSearchConfig()).
WithPackagesConfig(cfg.ToPackagesConfig()).
WithLicenseConfig(cfg.ToLicenseConfig()).
WithFilesConfig(cfg.ToFilesConfig()).
WithCatalogerSelection(
cataloging.NewSelectionRequest().
Expand Down Expand Up @@ -146,6 +149,13 @@ func (cfg Catalog) ToFilesConfig() filecataloging.Config {
}
}

// ToLicenseConfig translates the license section of the application catalog
// options into the cataloging.LicenseConfig value passed to WithLicenseConfig.
//
// NOTE(review): the destination field is referenced as
// "IncludeUnkownLicenseContent" (sic); the spelling must match the declaration
// in the cataloging package, which is not visible here.
func (cfg Catalog) ToLicenseConfig() cataloging.LicenseConfig {
	opts := cfg.License

	var out cataloging.LicenseConfig
	out.IncludeUnkownLicenseContent = opts.IncludeUnknownLicenseContent
	out.Coverage = opts.LicenseCoverage
	return out
}

func (cfg Catalog) ToPackagesConfig() pkgcataloging.Config {
archiveSearch := cataloging.ArchiveSearchConfig{
IncludeIndexedArchives: cfg.Package.SearchIndexedArchives,
Expand Down
28 changes: 28 additions & 0 deletions cmd/syft/internal/options/license.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package options

import (
"github.com/anchore/clio"
)

// licenseConfig holds the user-configurable license options. Each field is
// exposed under the same key for yaml, json and mapstructure decoding.
type licenseConfig struct {
// IncludeUnknownLicenseContent controls whether the content of a license is
// included in the SBOM when no valid SPDX ID can be determined for it.
// defaultLicenseConfig sets this to false.
IncludeUnknownLicenseContent bool `yaml:"include-unknown-license-content" json:"include-unknown-license-content" mapstructure:"include-unknown-license-content"`
// LicenseCoverage is the coverage value handed to the license cataloging
// configuration. defaultLicenseConfig sets this to 75.
// NOTE(review): the default suggests a 0-100 percentage scale — confirm against the scanner.
LicenseCoverage float64 `yaml:"license-coverage" json:"license-coverage" mapstructure:"license-coverage"`
}

// compile-time assertion that *licenseConfig satisfies clio.FieldDescriber
var _ interface {
clio.FieldDescriber
} = (*licenseConfig)(nil)

// DescribeFields adds a human-readable description for each license option to
// the given description set, passing a pointer to the field together with its
// help text.
func (o *licenseConfig) DescribeFields(descriptions clio.FieldDescriptionSet) {
descriptions.Add(&o.IncludeUnknownLicenseContent, `include the content of a license in the SBOM when syft
cannot determine a valid SPDX ID for the given license`)
descriptions.Add(&o.LicenseCoverage, `adjust the percent as a fraction of the total text, in normalized words, that
matches any valid license for the given inputs, expressed as a percentage across all of the licenses matched.`)
}

// defaultLicenseConfig returns the license options used when the user has not
// configured any: unknown license content is left out of the SBOM and the
// license coverage value is 75.
func defaultLicenseConfig() licenseConfig {
	var defaults licenseConfig
	defaults.IncludeUnknownLicenseContent = false
	defaults.LicenseCoverage = 75
	return defaults
}
4 changes: 2 additions & 2 deletions internal/licenses/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ func SetContextLicenseScanner(ctx context.Context, s Scanner) context.Context {
return context.WithValue(ctx, licenseScannerKey{}, s)
}

func ContextLicenseScanner(ctx context.Context) Scanner {
func ContextLicenseScanner(ctx context.Context) (Scanner, error) {
if s, ok := ctx.Value(licenseScannerKey{}).(Scanner); ok {
return s
return s, nil
}
return NewDefaultScanner()
}
82 changes: 50 additions & 32 deletions internal/licenses/scanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,66 +2,84 @@ package licenses

import (
"context"
"fmt"
"io"

"github.com/google/licensecheck"

"github.com/anchore/syft/internal/log"
"github.com/anchore/syft/syft/file"
"github.com/anchore/syft/syft/pkg"
)

const coverageThreshold = 75 // determined by experimentation
// Defaults applied by NewDefaultScanner before any Option is run.
const (
// DefaultCoverageThreshold is the coverage value a scan must reach for its
// matches to be reported as license IDs.
DefaultCoverageThreshold = 75 // determined by experimentation
// DefaultIncludeLicenseContent leaves the content of unidentified licenses
// out of the results unless explicitly enabled.
DefaultIncludeLicenseContent = false
)

// Scanner identifies licenses from the contents of a reader.
type Scanner interface {
// IdentifyLicenseIDs returns either the matched license IDs or the raw
// content that was read; per its callers the two results are mutually exclusive.
IdentifyLicenseIDs(context.Context, io.Reader) ([]string, []byte, error)
// FileSearch scans the given location's contents and reports the findings as file licenses.
FileSearch(context.Context, file.LocationReadCloser) ([]file.License, error)
// PkgSearch scans the given location's contents and reports the findings as package licenses.
PkgSearch(context.Context, file.LocationReadCloser) ([]pkg.License, error)
}

var _ Scanner = (*scanner)(nil)

type scanner struct {
coverageThreshold float64 // between 0 and 100
scanner func([]byte) licensecheck.Coverage
coverageThreshold float64 // between 0 and 100
includeLicenseContent bool
scanner func([]byte) licensecheck.Coverage
}

// NewDefaultScanner returns a scanner that uses a new instance of the default licensecheck package scanner.
func NewDefaultScanner() Scanner {
s, err := licensecheck.NewScanner(licensecheck.BuiltinLicenses())
if err != nil {
log.WithFields("error", err).Trace("unable to create default license scanner")
s = nil
}
return &scanner{
coverageThreshold: coverageThreshold,
scanner: s.Scan,
}
// ScannerConfig is the explicit configuration accepted by NewScanner. Its
// fields are copied into the scanner as-is; no defaults are filled in for
// zero values.
type ScannerConfig struct {
// CoverageThreshold is the coverage a scan must reach for matches to be
// reported as license IDs (the scanner documents it as between 0 and 100).
CoverageThreshold float64
// IncludeLicenseContent controls whether the content of an unidentified
// license is attached to the resulting license.
IncludeLicenseContent bool
// Scanner is the function that computes license coverage for raw bytes.
// When nil, IdentifyLicenseIDs returns no IDs, no content and no error.
Scanner func([]byte) licensecheck.Coverage
}

func NewScanner(scan func([]byte) licensecheck.Coverage, coverage float64) Scanner {
return scanner{
coverageThreshold: coverage,
scanner: scan,
// Option customizes a scanner while it is being constructed; NewDefaultScanner
// applies the given options in order after setting the defaults.
type Option func(*scanner)

// WithCoverage returns an Option that overrides the scanner's coverage
// threshold with the given value.
func WithCoverage(coverage float64) Option {
	var setCoverage Option = func(target *scanner) {
		target.coverageThreshold = coverage
	}
	return setCoverage
}

func (s scanner) IdentifyLicenseIDs(_ context.Context, reader io.Reader) ([]string, []byte, error) {
if s.scanner == nil {
return nil, nil, nil
// WithIncludeLicenseContent returns an Option that sets whether the scanner
// attaches the content of unidentified licenses to its results.
func WithIncludeLicenseContent(includeLicenseContent bool) Option {
	var setInclude Option = func(target *scanner) {
		target.includeLicenseContent = includeLicenseContent
	}
	return setInclude
}

content, err := io.ReadAll(reader)
// NewDefaultScanner returns a scanner that uses a new instance of the default licensecheck package scanner.
func NewDefaultScanner(o ...Option) (Scanner, error) {
s, err := licensecheck.NewScanner(licensecheck.BuiltinLicenses())
if err != nil {
return nil, nil, err
log.WithFields("error", err).Trace("unable to create default license scanner")
return nil, fmt.Errorf("unable to create default license scanner: %w", err)
}
newScanner := &scanner{
coverageThreshold: DefaultCoverageThreshold,
includeLicenseContent: DefaultIncludeLicenseContent,
scanner: s.Scan,
}

cov := s.scanner(content)
if cov.Percent < s.coverageThreshold {
// unknown or no licenses here?
// => return binary content
return nil, content, nil
for _, opt := range o {
opt(newScanner)
}
return newScanner, nil
}

var ids []string
for _, m := range cov.Match {
ids = append(ids, m.ID)
// NewScanner generates a license Scanner with the given ScannerConfig
// if config is nil NewDefaultScanner is used
func NewScanner(c *ScannerConfig) (Scanner, error) {
if c == nil {
return NewDefaultScanner()
}
return ids, nil, nil

return &scanner{
coverageThreshold: c.CoverageThreshold,
includeLicenseContent: c.IncludeLicenseContent,
scanner: c.Scanner,
}, nil
}
13 changes: 7 additions & 6 deletions internal/licenses/scanner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,11 @@ func TestIdentifyLicenseIDs(t *testing.T) {
expected: expectation{
yieldError: false,
ids: []string{"Apache-2.0"},
content: []byte{},
content: nil,
},
},
{
name: "custom license",
name: "custom license includes content for IdentifyLicenseIDs",
in: "test-fixtures/nvidia-software-and-cuda-supplement",
expected: expectation{
yieldError: false,
Expand All @@ -45,7 +45,7 @@ func TestIdentifyLicenseIDs(t *testing.T) {
t.Run(test.name, func(t *testing.T) {
content, err := os.ReadFile(test.in)
require.NoError(t, err)
ids, content, err := testScanner().IdentifyLicenseIDs(context.TODO(), bytes.NewReader(content))
ids, content, err := testScanner(false).IdentifyLicenseIDs(context.TODO(), bytes.NewReader(content))
if test.expected.yieldError {
require.Error(t, err)
} else {
Expand All @@ -66,10 +66,11 @@ func TestIdentifyLicenseIDs(t *testing.T) {
}
}

func testScanner() Scanner {
func testScanner(includeLicenseContent bool) Scanner {
return &scanner{
coverageThreshold: coverageThreshold,
scanner: licensecheck.Scan,
coverageThreshold: DefaultCoverageThreshold,
includeLicenseContent: includeLicenseContent,
scanner: licensecheck.Scan,
}
}

Expand Down
73 changes: 69 additions & 4 deletions internal/licenses/search.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"crypto/sha256"
"fmt"
"io"
"strings"

"github.com/anchore/syft/syft/file"
Expand All @@ -21,11 +22,35 @@ func getCustomLicenseContentHash(contents []byte) string {
return fmt.Sprintf("%x", hash[:])
}

// Search scans the contents of a license file to attempt to determine the type of license it is
func Search(ctx context.Context, scanner Scanner, reader file.LocationReadCloser) (licenses []pkg.License, err error) {
// IdentifyLicenseIDs reads the whole reader and runs the configured scan
// function over the bytes.
//
// The results are mutually exclusive:
//   - when the reported coverage is below the scanner's threshold, no IDs are
//     returned and the raw content is handed back for the caller to process;
//   - otherwise the ID of every match is returned and the content is nil.
//
// A scanner without a scan function yields nil for all three results. An
// error from reading is returned as-is, with no IDs and no content.
func (s *scanner) IdentifyLicenseIDs(_ context.Context, reader io.Reader) ([]string, []byte, error) {
	if s.scanner == nil {
		return nil, nil, nil
	}

	raw, readErr := io.ReadAll(reader)
	if readErr != nil {
		return nil, nil, readErr
	}

	result := s.scanner(raw)
	if result.Percent < s.coverageThreshold {
		// not enough of the text matched a known license:
		// return the content instead of any IDs
		return nil, raw, nil
	}

	var matched []string
	for i := range result.Match {
		matched = append(matched, result.Match[i].ID)
	}
	return matched, nil, nil
}

// PkgSearch scans the contents of a license file to attempt to determine the type of license it is
func (s *scanner) PkgSearch(ctx context.Context, reader file.LocationReadCloser) (licenses []pkg.License, err error) {
licenses = make([]pkg.License, 0)

ids, content, err := scanner.IdentifyLicenseIDs(ctx, reader)
ids, content, err := s.IdentifyLicenseIDs(ctx, reader)
if err != nil {
return nil, err
}
Expand All @@ -48,7 +73,47 @@ func Search(ctx context.Context, scanner Scanner, reader file.LocationReadCloser

lic := pkg.NewLicenseFromLocations(unknownLicenseType, reader.Location)
lic.SPDXExpression = UnknownLicensePrefix + getCustomLicenseContentHash(content)
lic.Contents = string(content)
if s.includeLicenseContent {
lic.Contents = string(content)
}
lic.Type = license.Declared

licenses = append(licenses, lic)
}

return licenses, nil
}

// FileSearch scans the contents of a license file to attempt to determine the type of license it is
func (s *scanner) FileSearch(ctx context.Context, reader file.LocationReadCloser) (licenses []file.License, err error) {
licenses = make([]file.License, 0)

ids, content, err := s.IdentifyLicenseIDs(ctx, reader)
if err != nil {
return nil, err
}

// IdentifyLicenseIDs can only return a list of ID or content
// These return values are mutually exclusive.
// If the scanner threshold for matching scores < 75% then we return the license full content
if len(ids) > 0 {
for _, id := range ids {
lic := file.NewLicense(id)
lic.Type = license.Concluded

licenses = append(licenses, lic)
}
} else if len(content) > 0 {
// harmonize line endings to unix compatible first:
// 1. \r\n => \n (Windows => UNIX)
// 2. \r => \n (Macintosh => UNIX)
content = []byte(strings.ReplaceAll(strings.ReplaceAll(string(content), "\r\n", "\n"), "\r", "\n"))

lic := file.NewLicense(unknownLicenseType)
lic.SPDXExpression = UnknownLicensePrefix + getCustomLicenseContentHash(content)
if s.includeLicenseContent {
lic.Contents = string(content)
}
lic.Type = license.Declared

licenses = append(licenses, lic)
Expand Down
Loading

0 comments on commit e584c9f

Please sign in to comment.