-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_ec_repos.py
152 lines (123 loc) · 4.94 KB
/
get_ec_repos.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import toml
import re
import os
import csv
from github import Auth, Github, Repository
import requests
import pandas as pd
from datetime import datetime,date
import time
import subprocess
# Load in from GH organizations
# GitHub token read from the environment; sent on every API request below.
api_key = os.getenv('GITHUB_API_KEY')
headers = {'Authorization': f'token {api_key}'}
# update date
# ISO date string (YYYY-MM-DD) used to stamp the output .toml/.csv filenames.
curr_date = str(date.today())
# Directory
# Current working directory; the crypto-ecosystems dataset is cloned into it.
directory = os.getcwd()
ec_directory = directory + '/crypto-ecosystems'
# Define Functions
def get_ec_repo(ec_directory):
    """Clone the Electric Capital crypto-ecosystems dataset if not already present."""
    # Guard clause: an existing directory means the clone was done previously.
    if os.path.exists(ec_directory):
        return
    try:
        subprocess.run(
            ["git", "clone",
             "https://github.com/electric-capital/crypto-ecosystems.git",
             ec_directory],
            check=True,
        )
    except subprocess.CalledProcessError as e:
        # Best-effort: report the failure but let the caller proceed.
        print(f"Error: {e}")
# Fetch latest from GitHub
def pull_latest_from_github(repo_path):
    """Fast-forward the local clone at `repo_path` to the latest `master`.

    First attempts a plain checkout+pull; if that fails (e.g. a dirty
    working tree from a previous run), stashes local changes and retries
    with an explicit `git pull origin master`.
    """
    try:
        # Fix: run git with argument lists and cwd= instead of
        # `shell=True` + f-string interpolation, which broke on paths
        # containing spaces/metacharacters and was shell-injection-prone.
        subprocess.run(["git", "checkout", "master"], cwd=repo_path, check=True)
        subprocess.run(["git", "pull"], cwd=repo_path, check=True)
    except subprocess.CalledProcessError:
        # Fallback path mirrors the original: stash, re-checkout, pull.
        subprocess.run(["git", "stash"], cwd=repo_path, check=True)
        subprocess.run(["git", "checkout", "master"], cwd=repo_path, check=True)
        subprocess.run(["git", "pull", "origin", "master"], cwd=repo_path, check=True)
# Define functions
def replace_name(name):
    """Normalize an ecosystem name into its toml filename stem.

    Runs of non-word characters collapse into a single hyphen, the result
    is lower-cased, and a trailing hyphen (left by trailing punctuation)
    is stripped.
    """
    slug = re.sub(r'\W+', '-', name).lower()
    if slug.endswith('-'):
        slug = slug[:-1]
    return slug
def find_file(file, ec_directory):
    """Locate the toml file for ecosystem `file` under `ec_directory`.

    The crypto-ecosystems dataset shards ecosystem files into one-letter
    directories, so only directories named after the filename's first
    character are inspected. Returns the full path, or None if absent.
    """
    filename = replace_name(file) + '.toml'
    shard = filename[0]  # one-letter shard directory, e.g. 'n' for near.toml
    for root, _dirs, entries in os.walk(ec_directory):
        if os.path.basename(root) == shard and filename in entries:
            return os.path.join(root, filename)
    return None
def load_repos(repo_master, toml):
    """Fold every repo url from a parsed ecosystem toml into `repo_master`.

    `toml` is the parsed dict and must have a 'repo' list of {'url': ...}
    entries. Mutates `repo_master` in place and also returns a
    de-duplicated copy (order unspecified).
    """
    repo_master.extend(entry['url'] for entry in toml['repo'])
    return list(set(repo_master))
def load_subecosystems(sub_master, org_master, toml):
    """Fold a toml's sub-ecosystems and GitHub organizations into the running lists.

    Mutates both input lists in place and returns de-duplicated copies
    (order unspecified) as a (sub_ecosystems, organizations) pair.
    """
    sub_master.extend(toml['sub_ecosystems'])
    org_master.extend(toml['github_organizations'])
    return list(set(sub_master)), list(set(org_master))
def get_org_repos(org):
    """Return the html urls of every public repo in a GitHub organization.

    `org` may be a full url (e.g. https://github.com/near); only the last
    path segment is used as the org name. Waits out 403 rate-limit
    responses in 60-second increments.
    """
    org_name = org.split('/')[-1]
    repos = []
    page = 1
    while True:
        # Fix: the original fetched only the first (default 30-item) page,
        # silently truncating large orgs; paginate with per_page=100.
        api_request = f"https://api.github.com/orgs/{org_name}/repos?per_page=100&page={page}"
        req = requests.get(api_request, headers=headers)
        while req.status_code == 403:
            print("403 Error")
            time.sleep(60)
            # Fix: the original retry dropped `headers`, so the retry went
            # unauthenticated and kept hitting the lower rate limit.
            req = requests.get(api_request, headers=headers)
        data = req.json()
        # Non-list payloads (e.g. error objects for unknown orgs) end the loop.
        if not isinstance(data, list) or not data:
            break
        repos.extend(r['html_url'] for r in data if 'html_url' in r)
        if len(data) < 100:  # short page => last page
            break
        page += 1
    return repos
# for each sub-ecosystem, 1) load their file 2) note sub-ecosystems, 3) add repos, 4) Add organizations
# for each sub-ecosystem, 1) load their file 2) note sub-ecosystems, 3) add repos, 4) Add organizations
def process_sub_ecosystem(sub_ecosystem, near_repos_master, sub_ecosystems_master, org_master):
    """Fold one sub-ecosystem's toml data into the three accumulator lists.

    Returns the updated (repos, sub_ecosystems, organizations) triple,
    with the repo list sorted.
    """
    filepath = find_file(sub_ecosystem, f'{ec_directory}/data/ecosystems')
    # Fix: a sub-ecosystem with no matching toml file previously crashed
    # in toml.load(None); skip it and leave the accumulators unchanged.
    if filepath is None:
        print(f"Warning: no toml file found for sub-ecosystem '{sub_ecosystem}'")
        return near_repos_master, sub_ecosystems_master, org_master
    # 1. Parse the ecosystem's toml file
    toml_data = toml.load(filepath)
    # 2. Collect its sub-ecosystems and GitHub organizations
    sub_ecosystems_master, org_master = load_subecosystems(sub_ecosystems_master, org_master, toml_data)
    # 3. Collect its repo urls, keeping the list sorted for stable output
    near_repos_master = load_repos(near_repos_master, toml_data)
    near_repos_master.sort()
    return near_repos_master, sub_ecosystems_master, org_master
# Load the repos from org master into repo_master
def load_organization_repos(repo_master, org_master):
    """Append every organization's repo urls to `repo_master`.

    Mutates `repo_master` in place and returns a de-duplicated copy
    (order unspecified).
    """
    repo_master.extend(
        url
        for org_url in org_master
        for url in get_org_repos(org_url)
    )
    return list(set(repo_master))
def pull_ec_near_repos():
    """Build the full NEAR repo list from Electric Capital's crypto-ecosystems data.

    Clones/updates the dataset, seeds from near.toml, then repeatedly folds
    in transitively referenced sub-ecosystems and GitHub organizations.
    Writes the sorted repo list to dated .toml and .csv files in the
    current working directory.
    """
    # Check if EC Loaded
    get_ec_repo(ec_directory)
    # Load in latest from EC File
    pull_latest_from_github(ec_directory)
    # Load the near toml file
    near_toml_path = ec_directory + '/data/ecosystems/n/near.toml'
    near_toml = toml.load(near_toml_path)
    # first load NEAR Repos
    near_repos_master = []
    sub_ecosystems_master = []
    org_master = []
    near_repos_master = load_repos(near_repos_master, near_toml)
    sub_ecosystems_master, org_master = load_subecosystems(sub_ecosystems_master, org_master, near_toml)
    # Loop through sub-ecosystems until a full pass adds no new repos;
    # repeated passes pick up sub-ecosystems discovered in earlier ones.
    change = 1
    while change != 0:
        prev_repos = len(near_repos_master)
        for sub in sub_ecosystems_master:
            near_repos_master, sub_ecosystems_master, org_master = process_sub_ecosystem(sub, near_repos_master, sub_ecosystems_master, org_master)
        change = len(near_repos_master) - prev_repos
        print(f"Added {change} to repo master")
    # Load repos into near_repos_master from each GitHub organization
    near_repos_master = load_organization_repos(near_repos_master, org_master)
    # Sort the repos for deterministic output
    near_repos_master.sort()
    # Save as toml file
    data = {"repo": near_repos_master}
    with open(f'{directory}/all_near_repos_{curr_date}.toml', 'w') as toml_file:
        toml.dump(data, toml_file)
    # Save as csv file (owner/name only, host prefix stripped)
    with open(f'{directory}/all_near_repos_{curr_date}.csv', 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        for repo in near_repos_master:
            # Fix: the original `repo.split('.com/')[1]` raised IndexError
            # for any url not containing '.com/'; keep such urls whole.
            _, sep, tail = repo.partition('.com/')
            writer.writerow([tail if sep else repo])
    print(f"Saved {len(near_repos_master)} NEAR repositories to all_near_repos_{curr_date}.csv")
if __name__ == "__main__":
    # Bug fix: the original called main(), which is not defined anywhere in
    # this script (NameError at runtime); the entry point is pull_ec_near_repos().
    pull_ec_near_repos()