import json
from typing import (
Optional,
Any,
List,
TypeVar,
Type,
cast,
Callable,
Sequence,
Tuple,
Union,
)
from datetime import datetime
import pandas as pd
import dateutil.parser
import requests
from gharchive.unzip import decompress
T = TypeVar("T")
[docs]def from_int(x: Any) -> int:
assert isinstance(x, int) and not isinstance(x, bool)
return x
[docs]def from_none(x: Any) -> Any:
assert x is None
return x
[docs]def from_union(fs, x):
for f in fs:
try:
return f(x)
except:
pass
assert False
[docs]def from_str(x: Any) -> str:
assert isinstance(x, str)
return x
[docs]def from_bool(x: Any) -> bool:
assert isinstance(x, bool)
return x
[docs]def to_class(c: Type[T], x: Any) -> dict:
assert isinstance(x, c)
return cast(Any, x).to_dict()
[docs]def from_list(f: Callable[[Any], T], x: Any) -> List[T]:
assert isinstance(x, list)
return [f(y) for y in x]
[docs]def from_datetime(x: Any) -> datetime:
return dateutil.parser.parse(x)
# TODO [#1]: support Timeline API format
#
# Any records from 2/12/2011-12/31/2014 were from the deprecated Timeline API
# and so come in a different format. Need to parse them into the same models.
# Currently the code will fail to parse any responses from this time period.
# There is already a 2012-06-14-15.json.gz in the test data which triggers this.
# Also update gharchive.search.SUPPORTED_BEGIN_YEAR after this and add constraints
# for not being earlier in 2011.
[docs]class SeriesSerializable:
_series_nested_attrs: Sequence[str] = tuple()
[docs] def to_dict(self) -> dict:
raise NotImplementedError
[docs] def to_series(self) -> pd.Series:
data = self.to_dict()
_extract_nested_attrs(self, data, self._series_nested_attrs)
return pd.Series(data)
def _extract_nested_attrs(
obj: SeriesSerializable, data: dict, nested_attrs: Sequence[str]
):
for nested_attr in nested_attrs:
del data[nested_attr]
data_model = getattr(obj, nested_attr)
if data_model is None:
# Cannot split element as is None
continue
if isinstance(data_model, (list, tuple)):
new_data = {}
for i, elem in enumerate(data_model):
temp_data = elem.to_dict()
_extract_nested_attrs(elem, temp_data, elem._series_nested_attrs)
new_data.update(
{
f"{nested_attr}_{key}_{i + 1}": value
for key, value in temp_data.items()
}
)
else:
new_data = data_model.to_dict()
_extract_nested_attrs(data_model, new_data, data_model._series_nested_attrs)
new_data = {
f"{nested_attr}_{key}": value for key, value in new_data.items()
}
data.update(new_data)
[docs]class Actor(SeriesSerializable):
id: Optional[int]
login: Optional[str]
display_login: Optional[str]
gravatar_id: Optional[str]
url: Optional[str]
avatar_url: Optional[str]
[docs] def __init__(
self,
id: Optional[int],
login: Optional[str],
display_login: Optional[str],
gravatar_id: Optional[str],
url: Optional[str],
avatar_url: Optional[str],
) -> None:
self.id = id
self.login = login
self.display_login = display_login
self.gravatar_id = gravatar_id
self.url = url
self.avatar_url = avatar_url
[docs] @staticmethod
def from_dict(obj: Any) -> "Actor":
assert isinstance(obj, dict)
id = from_union([from_int, from_none], obj.get("id"))
login = from_union([from_str, from_none], obj.get("login"))
display_login = from_union([from_str, from_none], obj.get("display_login"))
gravatar_id = from_union([from_str, from_none], obj.get("gravatar_id"))
url = from_union([from_str, from_none], obj.get("url"))
avatar_url = from_union([from_str, from_none], obj.get("avatar_url"))
return Actor(id, login, display_login, gravatar_id, url, avatar_url)
[docs] def to_dict(self) -> dict:
result: dict = {}
result["id"] = from_union([from_int, from_none], self.id)
result["login"] = from_union([from_str, from_none], self.login)
result["display_login"] = from_union([from_str, from_none], self.display_login)
result["gravatar_id"] = from_union([from_str, from_none], self.gravatar_id)
result["url"] = from_union([from_str, from_none], self.url)
result["avatar_url"] = from_union([from_str, from_none], self.avatar_url)
return result
[docs]class Author(SeriesSerializable):
name: Optional[str]
email: Optional[str]
[docs] def __init__(self, name: Optional[str], email: Optional[str]) -> None:
self.name = name
self.email = email
[docs] @staticmethod
def from_dict(obj: Any) -> "Author":
assert isinstance(obj, dict)
name = from_union([from_str, from_none], obj.get("name"))
email = from_union([from_str, from_none], obj.get("email"))
return Author(name, email)
[docs] def to_dict(self) -> dict:
result: dict = {}
result["name"] = from_union([from_str, from_none], self.name)
result["email"] = from_union([from_str, from_none], self.email)
return result
[docs]class Commit(SeriesSerializable):
_series_nested_attrs = ("author",)
sha: Optional[str]
author: Optional[Author]
message: Optional[str]
distinct: Optional[bool]
url: Optional[str]
[docs] def __init__(
self,
sha: Optional[str],
author: Optional[Author],
message: Optional[str],
distinct: Optional[bool],
url: Optional[str],
) -> None:
self.sha = sha
self.author = author
self.message = message
self.distinct = distinct
self.url = url
[docs] @staticmethod
def from_dict(obj: Any) -> "Commit":
assert isinstance(obj, dict)
sha = from_union([from_str, from_none], obj.get("sha"))
author = from_union([Author.from_dict, from_none], obj.get("author"))
message = from_union([from_str, from_none], obj.get("message"))
distinct = from_union([from_bool, from_none], obj.get("distinct"))
url = from_union([from_str, from_none], obj.get("url"))
return Commit(sha, author, message, distinct, url)
[docs] def to_dict(self) -> dict:
result: dict = {}
result["sha"] = from_union([from_str, from_none], self.sha)
result["author"] = from_union(
[lambda x: to_class(Author, x), from_none], self.author
)
result["message"] = from_union([from_str, from_none], self.message)
result["distinct"] = from_union([from_bool, from_none], self.distinct)
result["url"] = from_union([from_str, from_none], self.url)
return result
[docs]class Payload(SeriesSerializable):
_series_nested_attrs = ("commits",)
ref: Optional[str]
ref_type: Optional[str]
pusher_type: Optional[str]
push_id: Optional[int]
size: Optional[int]
distinct_size: Optional[int]
head: Optional[str]
before: Optional[str]
commits: Optional[List[Commit]]
[docs] def __init__(
self,
ref: Optional[str],
ref_type: Optional[str],
pusher_type: Optional[str],
push_id: Optional[int],
size: Optional[int],
distinct_size: Optional[int],
head: Optional[str],
before: Optional[str],
commits: Optional[List[Commit]],
) -> None:
self.ref = ref
self.ref_type = ref_type
self.pusher_type = pusher_type
self.push_id = push_id
self.size = size
self.distinct_size = distinct_size
self.head = head
self.before = before
self.commits = commits
[docs] @staticmethod
def from_dict(obj: Any) -> "Payload":
assert isinstance(obj, dict)
ref = from_union([from_str, from_none], obj.get("ref"))
ref_type = from_union([from_str, from_none], obj.get("ref_type"))
pusher_type = from_union([from_str, from_none], obj.get("pusher_type"))
push_id = from_union([from_int, from_none], obj.get("push_id"))
size = from_union([from_int, from_none], obj.get("size"))
distinct_size = from_union([from_int, from_none], obj.get("distinct_size"))
head = from_union([from_str, from_none], obj.get("head"))
before = from_union([from_str, from_none], obj.get("before"))
commits = from_union(
[lambda x: from_list(Commit.from_dict, x), from_none], obj.get("commits")
)
return Payload(
ref,
ref_type,
pusher_type,
push_id,
size,
distinct_size,
head,
before,
commits,
)
[docs] def to_dict(self) -> dict:
result: dict = {}
result["ref"] = from_union([from_str, from_none], self.ref)
result["ref_type"] = from_union([from_str, from_none], self.ref_type)
result["pusher_type"] = from_union([from_str, from_none], self.pusher_type)
result["push_id"] = from_union([from_int, from_none], self.push_id)
result["size"] = from_union([from_int, from_none], self.size)
result["distinct_size"] = from_union([from_int, from_none], self.distinct_size)
result["head"] = from_union([from_str, from_none], self.head)
result["before"] = from_union([from_str, from_none], self.before)
result["commits"] = from_union(
[lambda x: from_list(lambda x: to_class(Commit, x), x), from_none],
self.commits,
)
return result
[docs]class Repo(SeriesSerializable):
id: Optional[int]
name: Optional[str]
url: Optional[str]
[docs] def __init__(
self, id: Optional[int], name: Optional[str], url: Optional[str]
) -> None:
self.id = id
self.name = name
self.url = url
[docs] @staticmethod
def from_dict(obj: Any) -> "Repo":
assert isinstance(obj, dict)
id = from_union([from_int, from_none], obj.get("id"))
name = from_union([from_str, from_none], obj.get("name"))
url = from_union([from_str, from_none], obj.get("url"))
return Repo(id, name, url)
[docs] def to_dict(self) -> dict:
result: dict = {}
result["id"] = from_union([from_int, from_none], self.id)
result["name"] = from_union([from_str, from_none], self.name)
result["url"] = from_union([from_str, from_none], self.url)
return result
[docs]class ArchiveElement(SeriesSerializable):
_series_nested_attrs = ("actor", "repo", "payload")
id: Optional[str]
type: Optional[str]
actor: Optional[Actor]
repo: Optional[Repo]
payload: Optional[Payload]
public: Optional[bool]
created_at: Optional[datetime]
[docs] def __init__(
self,
id: Optional[str],
type: Optional[str],
actor: Optional[Actor],
repo: Optional[Repo],
payload: Optional[Payload],
public: Optional[bool],
created_at: Optional[datetime],
) -> None:
self.id = id
self.type = type
self.actor = actor
self.repo = repo
self.payload = payload
self.public = public
self.created_at = created_at
[docs] @staticmethod
def from_dict(obj: Any) -> "ArchiveElement":
assert isinstance(obj, dict)
id = from_union([from_str, from_none], obj.get("id"))
type = from_union([from_str, from_none], obj.get("type"))
actor = from_union([Actor.from_dict, from_none], obj.get("actor"))
repo = from_union([Repo.from_dict, from_none], obj.get("repo"))
payload = from_union([Payload.from_dict, from_none], obj.get("payload"))
public = from_union([from_bool, from_none], obj.get("public"))
created_at = from_union([from_datetime, from_none], obj.get("created_at"))
return ArchiveElement(id, type, actor, repo, payload, public, created_at)
[docs] def to_dict(self) -> dict:
result: dict = {}
result["id"] = from_union([from_str, from_none], self.id)
result["type"] = from_union([from_str, from_none], self.type)
result["actor"] = from_union(
[lambda x: to_class(Actor, x), from_none], self.actor
)
result["repo"] = from_union([lambda x: to_class(Repo, x), from_none], self.repo)
result["payload"] = from_union(
[lambda x: to_class(Payload, x), from_none], self.payload
)
result["public"] = from_union([from_bool, from_none], self.public)
result["created_at"] = from_union(
[lambda x: x.isoformat(), from_none], self.created_at
)
return result
[docs]class Archive:
data: List[ArchiveElement]
_date_cols: Sequence[str] = ('created_at',)
[docs] def __init__(self, data: List[ArchiveElement]):
self.data = data
[docs] @classmethod
def from_dict_list(cls, data: List[dict]):
archive_elems = [ArchiveElement.from_dict(d) for d in data]
return cls(archive_elems)
[docs] @classmethod
def from_response(cls, resp: requests.Response):
return cls.from_gzip_bytes(resp.content)
[docs] @classmethod
def from_gzip_bytes(cls, b: bytes):
data_str = decompress(b).decode("utf8")
data_strs = [s for s in data_str.split("\n") if s]
del data_str
json_str = "[" + ", ".join(data_strs) + "]"
del data_strs
data = json.loads(json_str)
return cls.from_dict_list(data)
[docs] def to_dict_list(self) -> List[dict]:
return [elem.to_dict() for elem in self]
[docs] def to_df(self) -> pd.DataFrame:
rows = [elem.to_series() for elem in self]
df = pd.DataFrame(rows)
for date_col in self._date_cols:
df[date_col] = pd.to_datetime(df[date_col])
return df
def __add__(self, other):
cls = self.__class__
return cls(self.data + other.data)
def __getitem__(self, item):
return self.data[item]
def __iter__(self):
yield from self.data
[docs] def filter(
self, filters: Sequence[Tuple[str, Union[int, float, str]]]
) -> "Archive":
new_elems = []
for elem in self:
valid = True
for (attr, value) in filters:
getattr_list = attr.split(".")
filter_value = elem
for gattr in getattr_list:
filter_value = getattr(filter_value, gattr)
if filter_value != value:
valid = False
break
if valid:
new_elems.append(elem)
cls = self.__class__
return cls(new_elems)