Skip to content

Commit

Permalink
reconnect int models to new schema and naming conventions (opensource…
Browse files Browse the repository at this point in the history
…-observer#1363)

* refactor: int models in directory

* fix artifact_name references

* fix: naming for repos by project table

* big fixes to artifact source and namespace fields

* fix: int events union

* fix linting error

* (broken) fixes to namespaces and event source in int models

* fix invocation of macro

* complete int model refactor

* remove log file

---------

Co-authored-by: Reuven V. Gonzales <[email protected]>
  • Loading branch information
ccerv1 and ravenac95 authored May 7, 2024
1 parent 83ef162 commit 7264214
Show file tree
Hide file tree
Showing 85 changed files with 999 additions and 1,694 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ data/
*.env
.env*.local
*.log
logs/
coverage.json

# typescript
Expand Down
67 changes: 30 additions & 37 deletions warehouse/dbt/models/intermediate/directory/int_artifacts.sql
Original file line number Diff line number Diff line change
@@ -1,47 +1,40 @@
with ossd_artifacts as (
select distinct
artifact_source_id,
artifact_namespace,
artifact_type,
artifact_url,
LOWER(artifact_name) as artifact_name
from {{ ref('stg_ossd__artifacts_by_project') }}
),

from_artifacts as (
{# `from` actor artifacts derived from all events #}
select
from_source_id as artifact_source_id,
from_namespace as artifact_namespace,
from_type as artifact_type,
"" as artifact_url, {# for now this is blank #}
LOWER(from_name) as artifact_name,
MAX(e.time) as last_used
from {{ ref('int_events') }} as e
group by 1, 2, 3, 4, 5
),

all_artifacts as (
with all_artifacts as (
{#
The `last_used` value is later used in this query to determine what the most
_current_ name is. However, oss-directory names are considered canonical so
we will use those by setting `last_used` to be the current timestamp.
`last_used` is only relevant for `git_user` artifacts.
#}
select
oa.*,
CURRENT_TIMESTAMP() as last_used
from ossd_artifacts as oa
artifact_source_id,
artifact_source,
artifact_type,
artifact_namespace,
artifact_url,
artifact_name
from {{ ref('int_ossd__artifacts_by_project') }}
union all
select * from from_artifacts
select
artifact_source_id,
artifact_source,
artifact_type,
artifact_namespace,
artifact_url,
MAX_BY(artifact_name, last_used) as artifact_name
from {{ ref('int_artifacts_history') }}
group by
artifact_source_id,
artifact_source,
artifact_type,
artifact_namespace,
artifact_url
)

select
select distinct
{{ oso_artifact_id("artifact") }} as artifact_id,
artifact_source_id as artifact_source_id,
artifact_namespace as artifact_namespace,
artifact_type as artifact_type,
artifact_url as artifact_url,
TO_JSON(ARRAY_AGG(distinct artifact_name)) as artifact_names,
MAX_BY(artifact_name, last_used) as artifact_latest_name
artifact_source_id,
artifact_source,
artifact_type,
artifact_namespace,
artifact_name,
artifact_url
from all_artifacts
group by 1, 2, 3, 4, 5
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
{#
  Naming history of the `from` artifacts observed in `int_events`.

  Emits one row per distinct (source id, source, type, namespace, url, name)
  combination, together with the earliest (`first_used`) and latest
  (`last_used`) event time at which that combination appeared.

  Per the original notes, this currently only captures the history of
  git_users and does not capture git_repo naming histories.
  NOTE(review): no artifact-type filter is applied in this model, so that
  statement presumably depends on what `int_events` exposes as `from`
  artifacts -- confirm.
#}

with from_artifact_events as (
  {# Map the event-side `from_*` fields onto the generic artifact columns. #}
  select
    from_artifact_source_id as artifact_source_id,
    event_source as artifact_source,
    from_artifact_type as artifact_type,
    from_artifact_namespace as artifact_namespace,
    "" as artifact_url, {# always an empty string in this model #}
    from_artifact_name as artifact_name,
    time as event_time
  from {{ ref('int_events') }}
)

select
  artifact_source_id,
  artifact_source,
  artifact_type,
  artifact_namespace,
  artifact_url,
  artifact_name,
  MAX(event_time) as last_used,
  MIN(event_time) as first_used
from from_artifact_events
group by
  artifact_source_id,
  artifact_source,
  artifact_type,
  artifact_namespace,
  artifact_url,
  artifact_name
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
{#
This model is responsible for generating a list of all artifacts associated with a project.
This includes repositories, npm packages, blockchain addresses, and contracts.

Currently, the source and namespace for blockchain artifacts are the same. This may change
in the future.
#}

with all_repos as (
  {#
    One row per repository in `int_ossd__repositories_by_project`, labelled
    as a GITHUB / REPOSITORY artifact. The repository id (cast to a string)
    is the source id and the repository owner is the namespace.
  #}
  select
    repos.project_id,
    CAST(repos.id as STRING) as artifact_source_id,
    "GITHUB" as artifact_source,
    "REPOSITORY" as artifact_type,
    repos.owner as artifact_namespace,
    repos.name_with_owner as artifact_name,
    repos.url as artifact_url
  from {{ ref('int_ossd__repositories_by_project') }} as repos
),

all_npm_raw as (
  {#
    One row per entry in each project's `npm` JSON array, labelled as an
    NPM / PACKAGE artifact. The package URL is used as both the source id
    and the artifact url.

    `artifact_name` is whatever follows the npmjs.com package prefix. The
    start position is derived from the prefix length instead of a hard-coded
    offset: "https://npmjs.com/package/" is 26 characters long, so the name
    starts at position 27 (the previous literal 28 dropped the first
    character of the package name for non-www URLs).

    URLs that match neither prefix yield a NULL `artifact_name`.
  #}
  select
    "NPM" as artifact_source,
    "PACKAGE" as artifact_type,
    projects.project_id,
    JSON_VALUE(npm.url) as artifact_source_id,
    case
      when
        JSON_VALUE(npm.url) like "https://npmjs.com/package/%"
        then SUBSTR(
          JSON_VALUE(npm.url),
          LENGTH("https://npmjs.com/package/") + 1
        )
      when
        JSON_VALUE(npm.url) like "https://www.npmjs.com/package/%"
        then SUBSTR(
          JSON_VALUE(npm.url),
          LENGTH("https://www.npmjs.com/package/") + 1
        )
    end as artifact_name,
    JSON_VALUE(npm.url) as artifact_url
  from
    {{ ref('stg_ossd__current_projects') }} as projects
  cross join
    UNNEST(JSON_QUERY_ARRAY(projects.npm)) as npm
),

all_npm as (
  {#
    Adds `artifact_namespace` to the npm rows: every "@" is removed from the
    package name and the text before the first "/" is kept. A name of the
    form "@scope/pkg" therefore yields "scope", and a name without a "/"
    yields the name itself.
  #}
  select
    project_id,
    artifact_source_id,
    artifact_source,
    artifact_type,
    SPLIT(REPLACE(artifact_name, "@", ""), "/")[SAFE_OFFSET(0)]
      as artifact_namespace,
    artifact_name,
    artifact_url
  from all_npm_raw
),

ossd_blockchain as (
  {#
    Expands each project's `blockchain` JSON array into one row per
    (address, network, tag) combination. The network is used as both the
    artifact source and the namespace, the tag becomes the artifact type,
    and the address is reused as source id, name and url.

    Because the arrays are cross joined, an entry whose `networks` or `tags`
    array is empty produces no rows.
  #}
  select
    projects.project_id,
    JSON_VALUE(blockchains.address) as artifact_source_id,
    network as artifact_source,
    tag as artifact_type,
    network as artifact_namespace,
    JSON_VALUE(blockchains.address) as artifact_name,
    JSON_VALUE(blockchains.address) as artifact_url
  from {{ ref('stg_ossd__current_projects') }} as projects
  cross join UNNEST(JSON_QUERY_ARRAY(projects.blockchain)) as blockchains
  cross join UNNEST(JSON_VALUE_ARRAY(blockchains.networks)) as network
  cross join UNNEST(JSON_VALUE_ARRAY(blockchains.tags)) as tag
),

all_deployers as (
  {#
    Stacks the per-chain deployer staging models and tags every row with the
    chain it came from (the same literal is used for namespace and source).

    The branches are combined positionally through `select *`, so the three
    staging models must expose the same columns in the same order. The branch
    order is kept as-is because the first branch supplies the column names
    used downstream.
  #}
  select
    *,
    "OPTIMISM" as artifact_namespace,
    "OPTIMISM" as artifact_source
  from {{ ref('stg_optimism__deployers') }}

  union all

  select
    *,
    "MAINNET" as artifact_namespace,
    "MAINNET" as artifact_source
  from {{ ref('stg_ethereum__deployers') }}

  union all

  select
    *,
    "ARBITRUM_ONE" as artifact_namespace,
    "ARBITRUM_ONE" as artifact_source
  from {{ ref('stg_arbitrum__deployers') }}
),

discovered_contracts as (
  {#
    Contracts attributed to a project because they were deployed by one of
    the project's listed addresses on the same network. Only project
    addresses tagged EOA, DEPLOYER or FACTORY are treated as deployers. The
    contract address is reused as source id, name and url, and the project's
    network value is carried through as namespace and source.

    The comparisons are case-normalized here because the oss-directory
    values are only normalized later, in `all_unique_artifacts`. Comparing
    the raw tag / network / address values against the uppercase literals
    and the deployer addresses silently dropped every match whose casing
    differed.

    NOTE(review): a network spelled differently from the literals in
    `all_deployers` (for example a hyphen instead of an underscore) still
    will not match -- confirm against the oss-directory data.
  #}
  select
    "CONTRACT" as artifact_type,
    ob.project_id,
    ad.contract_address as artifact_source_id,
    ob.artifact_namespace,
    ob.artifact_namespace as artifact_source,
    ad.contract_address as artifact_name,
    ad.contract_address as artifact_url
  from ossd_blockchain as ob
  inner join all_deployers as ad
    on
      LOWER(ob.artifact_source_id) = LOWER(ad.deployer_address)
      and UPPER(ob.artifact_namespace) = UPPER(ad.artifact_namespace)
  {# Single-table filter; equivalent to the join predicate on an inner join. #}
  where UPPER(ob.artifact_type) in ("EOA", "DEPLOYER", "FACTORY")
),

all_artifacts as (
  {#
    Stacks every artifact family into one common seven-column shape.
    Duplicates are kept at this stage. Every branch lists the columns in the
    same order because `union all` matches them by position.
  #}

  {# GitHub repositories #}
  select
    project_id,
    artifact_source_id,
    artifact_source,
    artifact_type,
    artifact_namespace,
    artifact_name,
    artifact_url
  from all_repos

  union all

  {# npm packages #}
  select
    project_id,
    artifact_source_id,
    artifact_source,
    artifact_type,
    artifact_namespace,
    artifact_name,
    artifact_url
  from all_npm

  union all

  {# Blockchain addresses listed on the project #}
  select
    project_id,
    artifact_source_id,
    artifact_source,
    artifact_type,
    artifact_namespace,
    artifact_name,
    artifact_url
  from ossd_blockchain

  union all

  {# Contracts found through the project's deployer addresses #}
  select
    project_id,
    artifact_source_id,
    artifact_source,
    artifact_type,
    artifact_namespace,
    artifact_name,
    artifact_url
  from discovered_contracts
),

all_unique_artifacts as (
  {#
    Normalizes casing and then removes duplicate rows: source, type and
    namespace are upper-cased; source id, name and url are lower-cased.
  #}
  select distinct
    project_id,
    LOWER(artifact_source_id) as artifact_source_id,
    UPPER(artifact_source) as artifact_source,
    UPPER(artifact_type) as artifact_type,
    UPPER(artifact_namespace) as artifact_namespace,
    LOWER(artifact_name) as artifact_name,
    LOWER(artifact_url) as artifact_url
  from all_artifacts
)

{#
  Final output: the de-duplicated artifacts plus an `artifact_id` produced by
  the `oso_artifact_id` macro, which is given the table alias `a`.
#}
select
  a.project_id,
  a.artifact_source_id,
  a.artifact_source,
  a.artifact_type,
  a.artifact_namespace,
  a.artifact_name,
  a.artifact_url,
  {{ oso_artifact_id("artifact", "a") }} as artifact_id
from all_unique_artifacts as a
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ select
from {{ ref('stg_ossd__current_collections') }} as collections
cross join UNNEST(collections.projects) as project_name
inner join {{ ref('stg_ossd__current_projects') }} as projects
on projects.name = project_name
on projects.project_name = project_name
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ select
repos.*,
projects.project_id,
"GITHUB" as repository_source,
{{ oso_id("'GITHUB'", "'GIT_REPOSITORY'", "CAST(repos.id AS STRING)") }}
{{ oso_id("'GITHUB'", "'REPOSITORY'", "CAST(repos.id AS STRING)") }}
as artifact_id
from
{{ ref('stg_ossd__current_projects') }} as projects
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ with ranked_repos as (
COUNT(distinct owner)
over (partition by project_id)
as count_github_owners
from {{ ref('stg_ossd__repositories_by_project') }}
from {{ ref('int_ossd__repositories_by_project') }}
)

select
Expand Down
4 changes: 2 additions & 2 deletions warehouse/dbt/models/intermediate/directory/int_projects.sql
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ with ranked_repos as (
COUNT(distinct owner)
over (partition by project_id)
as github_owners_count
from {{ ref('stg_ossd__repositories_by_project') }}
from {{ ref('int_ossd__repositories_by_project') }}
),

project_owners as (
Expand Down Expand Up @@ -39,4 +39,4 @@ select
as npm_artifact_count
from {{ ref('stg_ossd__current_projects') }} as projects
left join project_owners as project_owners
on projects.id = project_owners.project_id
on projects.project_id = project_owners.project_id
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@ select
projects.project_source,
projects.project_namespace,
projects.project_name
from {{ ref('stg_ossd__projects_by_collection') }} as projects_by_collection
from {{ ref('int_ossd__projects_by_collection') }} as projects_by_collection
inner join {{ ref('stg_ossd__current_projects') }} as projects
on projects_by_collection.project_id = projects.id
on projects_by_collection.project_id = projects.project_id
Loading

0 comments on commit 7264214

Please sign in to comment.