GithubDataScraper/GithubDataScraper.py

209 lines
7.4 KiB
Python
Raw Normal View History

2020-09-10 21:38:27 +00:00
# To add a new cell, type '# %%'
# To add a new markdown cell, type '# %% [markdown]'
# %%
import requests
from requests_oauthlib import OAuth2Session
import os
import json
import pandas as pd
import time
from datetime import datetime
# %%
# Maximum age, in days, of the commits and issues that are collected.
# The scraping loops skip the age check entirely when this is None.
MAXTIME = 365
# Name of a results folder; no active code in the visible file reads it.
RESULT_FOLDER = "results"
2020-11-04 16:02:29 +00:00
# These two values are needed so that the OAuth session can be established.
# Both are empty strings here (presumably to be filled in before running).
CLIENTID = ""
TOKEN = ""
2020-09-10 21:38:27 +00:00
# CSV file read by main(); it is indexed by a "Github Tag" column there.
FILE = "decentral_communication_protocols.csv"
# %%
def getGithubOAuthSession():
    """Build the OAuth2 session used for all GitHub API requests.

    Returns:
        An ``OAuth2Session`` created for ``CLIENTID`` whose token dictionary
        carries ``TOKEN`` under the ``access_token`` key.
    """
    return OAuth2Session(CLIENTID, token={'access_token': TOKEN})
# %%
def createCSV(pathToFile, list):
    """Write the given records to a CSV file.

    Args:
        pathToFile: Destination path handed to ``DataFrame.to_csv``.
        list: Records from which the pandas ``DataFrame`` is built.
    """
    pd.DataFrame(list).to_csv(pathToFile)
# %%
def getCommits(repoName, oAuthSession, retryWait=3600):
    """Collect the recent commits of a repository and their committer logins.

    Pages through ``/repos/{repoName}/commits`` until an empty page arrives,
    a commit older than ``MAXTIME`` days is seen, or the request fails with a
    status that is not worth retrying.

    Args:
        repoName: Repository in ``owner/name`` form.
        oAuthSession: Object whose ``get(url, headers=...)`` returns a
            response exposing ``status_code``, ``text`` and ``json()``.
        retryWait: Seconds to sleep before retrying after a 403, 429 or 5xx
            response.

    Returns:
        A tuple ``(commitList, devList)``: the commit objects in the order
        received and the distinct committer logins in first-seen order.
    """
    commitList = []
    devList = []
    headers = {'Accept': 'application/vnd.github.mercy-preview+json'}
    # GitHub pages are 1-based; page 0 is served as the first page again,
    # which would store the first page's commits twice.
    page = 1
    while True:
        response = oAuthSession.get(
            'https://api.github.com/repos/' + repoName + '/commits?page=' + str(page),
            headers=headers)
        status = response.status_code
        if status == 200:
            commits = response.json()
            if not commits:
                break
            page += 1
            tooOld = False
            for commit in commits:
                if MAXTIME is not None:
                    # The timestamp is UTC ("...Z"); compare it with an aware
                    # "now" in the same zone instead of naive local time.
                    authored = datetime.strptime(
                        commit['commit']['author']['date'], '%Y-%m-%dT%H:%M:%S%z')
                    if (datetime.now(authored.tzinfo) - authored).days > MAXTIME:
                        tooOld = True
                        break
                committer = commit.get('committer')
                if (committer is not None and 'login' in committer
                        and committer['login'] not in devList):
                    devList.append(committer['login'])
                commitList.append(commit)
            if tooOld:
                break
        elif status == 409:  # empty repo
            break
        else:
            # Report first so the reason is visible before any long wait.
            print("Error in Http Request (commitsRequest):", status, response.text)
            if status in (403, 429) or status >= 500:
                # Rate limiting or a server-side problem: wait and retry.
                time.sleep(retryWait)
            else:
                # Other client errors (e.g. 404) will not go away by waiting;
                # give up on this repository instead of looping forever.
                break
    return commitList, devList
# %%
# %%
def getIssues(repoName, oAuthSession, retryWait=3600):
    """Collect the recent issues of a repository and the logins assigned to them.

    Pages through ``/repos/{repoName}/issues?state=all`` until an empty page
    arrives, an issue created more than ``MAXTIME`` days ago is seen, or the
    request fails with a status that is not worth retrying.

    Args:
        repoName: Repository in ``owner/name`` form.
        oAuthSession: Object whose ``get(url, headers=...)`` returns a
            response exposing ``status_code``, ``text`` and ``json()``.
        retryWait: Seconds to sleep before retrying after a 403, 429 or 5xx
            response.

    Returns:
        A tuple ``(issueList, devList)``: the issue objects in the order
        received and the distinct assignee logins in first-seen order.
    """
    issueList = []
    devList = []
    headers = {'Accept': 'application/vnd.github.mercy-preview+json'}
    # GitHub pages are 1-based; page 0 is served as the first page again,
    # which would store the first page's issues twice.
    page = 1
    while True:
        response = oAuthSession.get(
            'https://api.github.com/repos/' + repoName + '/issues?state=all&page=' + str(page),
            headers=headers)
        status = response.status_code
        if status == 200:
            issues = response.json()
            if not issues:
                break
            page += 1
            tooOld = False
            for issue in issues:
                if MAXTIME is not None:
                    # The timestamp is UTC ("...Z"); compare it with an aware
                    # "now" in the same zone instead of naive local time.
                    created = datetime.strptime(issue['created_at'], '%Y-%m-%dT%H:%M:%S%z')
                    if (datetime.now(created.tzinfo) - created).days > MAXTIME:
                        tooOld = True
                        break
                # The single assignee first, then the full assignee list.
                candidates = [issue.get('assignee')]
                candidates.extend(issue.get('assignees') or [])
                for person in candidates:
                    if (person is not None and 'login' in person
                            and person['login'] not in devList):
                        devList.append(person['login'])
                issueList.append(issue)
            if tooOld:
                break
        elif status == 409:  # empty repo
            break
        else:
            print("Error in Http Request (issuesRequest):", status, response.text)
            if status in (403, 429) or status >= 500:
                # Rate limiting or a server-side problem: wait and retry.
                time.sleep(retryWait)
            else:
                # Other client errors (e.g. 404) will not go away by waiting;
                # give up on this repository instead of looping forever.
                break
    return issueList, devList
# %%
# %%
def getRepoFacts(repoName, oAuthSession):
    """Gather the issues, commits and involved developers of one repository.

    Args:
        repoName: Repository in ``owner/name`` form.
        oAuthSession: Session passed through to ``getIssues`` and
            ``getCommits``.

    Returns:
        A tuple ``(issueList, commitList, devList)`` where ``devList`` holds
        each login reported by either helper exactly once, issue assignees
        first, in first-seen order.
    """
    issueList, devListIssues = getIssues(repoName, oAuthSession)
    commitList, devListCommits = getCommits(repoName, oAuthSession)
    # Merge as a flat list of logins. Appending the issue list as a single
    # element would nest it and defeat the duplicate check below.
    devList = list(devListIssues)
    for dev in devListCommits:
        if dev not in devList:
            devList.append(dev)
    return issueList, commitList, devList
# %%
# %%
def getTopicFacts(topicName):
    """Scrape every repository tagged with a topic and summarise the results.

    Searches GitHub for ``topic:<topicName>``, collects issues, commits and
    developers of each distinct repository via ``getRepoFacts``, writes
    ``<topicName>_repos.csv``, ``<topicName>_issues.csv`` and
    ``<topicName>_commits.csv`` through ``createCSV`` and prints the totals.

    Args:
        topicName: Topic name appended to the search query.

    Returns:
        A tuple ``(amountRepos, amountStars, amountIssues, amountCommits,
        amountDevs)``; all zeros when the initial search request fails.
    """
    amountStars = 0
    repoNameList = []
    repoList = []
    issueList = []
    commitList = []
    devList = []
    oAuthSession = getGithubOAuthSession()
    headers = {'Accept': 'application/vnd.github.mercy-preview+json'}
    searchUrl = 'https://api.github.com/search/repositories?q=topic:' + topicName

    topicResponse = oAuthSession.get(searchUrl, headers=headers)
    if topicResponse.status_code != 200:
        print("Error in Http Request (topicRequest):", topicResponse.status_code, topicResponse.text)
        return 0, 0, 0, 0, 0

    topicResponseJson = topicResponse.json()
    # The request without a page parameter already delivered page 1.
    page = 1
    while topicResponseJson.get('items'):
        for repo in topicResponseJson['items']:
            fullName = repo['full_name']
            print(fullName)
            # Skip repositories that were already processed.
            if fullName in repoNameList:
                continue
            issueListRepo, commitListRepo, devListRepo = getRepoFacts(fullName, oAuthSession)
            issueList.extend(issueListRepo)
            commitList.extend(commitListRepo)
            for dev in devListRepo:
                if dev not in devList:
                    devList.append(dev)
            amountStars += int(repo['stargazers_count'])
            repoNameList.append(fullName)
            repoList.append(repo)
        page += 1
        nextResponse = oAuthSession.get(searchUrl + '&page=' + str(page), headers=headers)
        if nextResponse.status_code != 200:
            # Report the failure instead of silently treating an error body
            # as the end of the result list.
            print("Error in Http Request (topicRequest):", nextResponse.status_code, nextResponse.text)
            break
        topicResponseJson = nextResponse.json()

    createCSV(topicName + "_repos.csv", repoList)
    createCSV(topicName + "_issues.csv", issueList)
    createCSV(topicName + "_commits.csv", commitList)
    amountRepos = len(repoList)
    amountDevs = len(devList)
    amountIssues = len(issueList)
    amountCommits = len(commitList)
    print("Amount Repos:", str(amountRepos))
    print("Amount Stars:", str(amountStars))
    print("Amount Issues:", str(amountIssues))
    print("Amount Commits:", str(amountCommits))
    print("Amount Devs:", str(amountDevs))
    return amountRepos, amountStars, amountIssues, amountCommits, amountDevs
# %%
# %%
def main():
    """Scrape every protocol listed in ``FILE`` and save the enriched table.

    Reads ``FILE`` with pandas, calls ``getTopicFacts`` for each value of its
    "Github Tag" column, adds the five resulting counts as new columns and
    writes the table to ``new.csv``.
    """
    columnNames = ('amountRepos', 'amountStars', 'amountIssues', 'amountCommits', 'amountDevs')
    protocols = pd.read_csv(FILE)
    facts = []
    for tag in protocols["Github Tag"]:
        print("Protocol:", tag)
        facts.append(getTopicFacts(tag))
    newColumns = {
        name: [row[position] for row in facts]
        for position, name in enumerate(columnNames)
    }
    # DataFrame.assign returns a new frame; keep it, otherwise the computed
    # columns never reach the output file.
    protocols = protocols.assign(**newColumns)
    protocols.to_csv("new.csv")
# %%
# %%
# Script entry point: announce the start, run the scraper, announce the end.
if __name__ == "__main__":
    print("Start")
    main()
    print("End")