Coverage for lynceus/devops/github_devops_analyzer.py: 91%

218 statements  

« prev     ^ index     » next       coverage.py v7.10.0, created at 2025-07-29 08:46 +0000

1from collections import defaultdict 

2from datetime import date, datetime, timedelta, timezone 

3from pathlib import Path 

4 

5import github 

6import requests 

7from github import Github, GithubObject, Permissions, UnknownObjectException 

8from github.PaginatedList import PaginatedList 

9 

10from lynceus.core.exchange.lynceus_exchange import LynceusExchange 

11from lynceus.core.lynceus import LynceusSession 

12from lynceus.devops.devops_analyzer import DevOpsAnalyzer 

13from lynceus.utils import filter_kwargs 

14from lynceus.utils.lynceus_dict import LynceusDict 

15 

16 

def github_exception_handler(func):
    """
    Decorator translating GitHub API errors into standard built-in exceptions.

    Parameters
    ----------
    func : callable
        Function to wrap with exception handling

    Returns
    -------
    callable
        Wrapper exposing the same name and docstring as ``func``

    Raises
    ------
    PermissionError
        For 401/403 HTTP status codes
    NameError
        For 404 HTTP status codes

    Notes
    -----
    Only errors raised while the call itself executes are translated. The body
    of a generator function runs after the wrapper has returned, so errors
    raised during its iteration are not intercepted.
    """
    # Local import keeps this block self-contained.
    from functools import wraps  # pylint: disable=import-outside-toplevel

    # wraps() preserves __name__/__doc__ of the decorated callable, so decorated
    # public methods keep their documentation and readable tracebacks.
    @wraps(func)
    def func_wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except github.GithubException as error:
            # Intercepts permission error.
            if error.status in (401, 403):
                raise PermissionError(
                    "You don't have enough permission to perform this operation on Github."
                ) from error

            # Intercepts "not found" error.
            if error.status == 404:
                raise NameError("Unable to find requested Object.") from error

            # Raises any other error unchanged.
            raise

    return func_wrapper

77 

78 

def get_list_from_paginated_and_count(plist: PaginatedList, count: int | None = None):
    """
    Materialise a GitHub paginated result as a list, optionally truncated.

    Parameters
    ----------
    plist : PaginatedList
        GitHub PaginatedList object
    count : int, optional
        Maximum number of items to retrieve; ``None`` or ``0`` means no limit

    Returns
    -------
    list
        Items of the paginated result, limited to ``count`` entries when a limit is given
    """
    # Without a usable limit, the complete result is returned.
    if not count:
        return list(plist)

    try:
        truncated = list(plist[:count])
    except IndexError:
        # Slicing failed: behaves as if no count had been specified.
        return list(plist)

    return truncated

105 

106 

107# Cf. https://pygithub.readthedocs.io/en/latest/introduction.html 

108# Cf. https://docs.github.com/en/rest 

# Cf. https://pygithub.readthedocs.io/en/stable/changes.html#breaking-changes for the 2.x breaking changes (mainly datetimes, which are no longer naive).

110# Important: 

111# - the Gitlab group notion corresponds to Organization notion on Github 

112# - the Team notion of Github is NOT managed 

113class GithubDevOpsAnalyzer(DevOpsAnalyzer): 

114 """ 

115 GitHub-specific implementation of the DevOps analyzer. 

116 

117 This class provides concrete implementations for all DevOps operations 

118 specific to GitHub, including authentication, user/organization/repository management, 

119 and statistics gathering. It uses the PyGithub library to interact with GitHub's REST API. 

120 

121 Important notes: 

122 - GitLab "groups" correspond to GitHub "organizations" 

123 - GitHub "teams" are not currently managed by this implementation 

124 - Some features may have different capabilities compared to GitLab 

125 """ 

126 

def __init__(
    self,
    lynceus_session: LynceusSession,
    uri: str,
    token: str,
    lynceus_exchange: LynceusExchange,
):
    """
    Build the analyzer and its underlying PyGithub client.

    Parameters
    ----------
    lynceus_session : LynceusSession
        The Lynceus session instance
    uri : str
        Base URL of the GitHub instance; when falsy, PyGithub's default endpoint is used
    token : str
        Token used for GitHub authentication
    lynceus_exchange : LynceusExchange
        Exchange instance for data communication
    """
    super().__init__(lynceus_session, uri, "github", lynceus_exchange)

    # The API endpoint is only overridden when an explicit URI is provided.
    endpoint_options = {"base_url": uri} if uri else {}
    self.__manager = Github(
        auth=github.Auth.Token(token), timeout=60, **endpoint_options
    )

154 

155 # The following methods are only for uniformity and coherence. 

def _extract_user_info(self, user) -> LynceusDict:
    """
    Map a GitHub user object to the uniform user dictionary.

    The GitHub login is exposed under both the "login" and "username" keys.
    """
    user_info = {
        "id": user.id,
        "name": user.name,
        "login": user.login,
        "username": user.login,
        "e-mail": user.email,
        "avatar_url": user.avatar_url,
        "bio": user.bio,
    }
    return LynceusDict(user_info)

168 

def _extract_group_info(self, group) -> LynceusDict:
    """
    Map a GitHub organization object to the uniform group dictionary.
    """
    group_info = {
        "id": group.id,
        "name": group.name,
        # N.B.: there does not seem to be any "path" for a Github organization,
        # so the login is used instead.
        "path": group.login,
    }
    return LynceusDict(group_info)

178 

def _extract_project_info(self, project) -> LynceusDict:
    """
    Map a GitHub repository object to the uniform project dictionary.

    The repository ``full_name`` is used as the project path.
    """
    project_info = {
        "id": project.id,
        "name": project.name,
        "path": project.full_name,
        "web_url": project.html_url,
    }
    return LynceusDict(project_info)

188 

def _extract_member_info(self, member) -> LynceusDict:
    """
    Map a GitHub member/contributor object to the uniform member dictionary.

    In Github the corresponding notion is 'contributors', and there is no
    'status' information: every member is reported with ``STATUS_ACTIVE`` and
    an undefined parent.
    """
    member_info = {
        "id": member.id,
        "name": member.name,
        "login": member.login,
        "username": member.login,
        "parent_id": self.INFO_UNDEFINED,
        "state": self.STATUS_ACTIVE,
    }
    return LynceusDict(member_info)

201 

def _extract_issue_event_info(self, issue_event, **kwargs) -> LynceusDict:
    """
    Map a GitHub issue event to the uniform issue event dictionary.

    Parameters
    ----------
    issue_event
        GitHub issue event object
    **kwargs
        Extra entries merged into the result (they override computed keys);
        must contain "project_web_url", used to build the issue URL

    Returns
    -------
    LynceusDict
        Issue event information merged with ``kwargs``
    """
    event_info = {
        "id": issue_event.id,
        "issue_id": issue_event.issue.number,
        "action": issue_event.event,
        "target_type": "issue",
        "created_at": issue_event.created_at,
        # Some events have no actor.
        "author": (
            issue_event.actor.name if issue_event.actor else self.INFO_UNDEFINED
        ),
        "title": issue_event.issue.title,
        "issue_web_url": kwargs["project_web_url"]
        + f"/issues/{issue_event.issue.number}",
    }

    # N.B.: project information is unavailable here, and must be added by caller via kwargs.
    return LynceusDict(event_info | kwargs)

221 

def _extract_commit_info(self, commit) -> LynceusDict:
    """
    Map a GitHub commit object to the uniform commit dictionary.

    Parameters
    ----------
    commit
        GitHub commit object

    Returns
    -------
    LynceusDict
        Commit identifiers, parents, message, date and author/committer details
    """
    git_commit = commit.commit
    raw_git_commit = commit.raw_data["commit"]

    # The GitHub-level author/committer is missing when the commit e-mail is not
    # linked to any GitHub account; falls back to the name recorded in the git
    # commit instead of failing with an AttributeError.
    author_name = commit.author.name if commit.author else git_commit.author.name
    committer_name = (
        commit.committer.name if commit.committer else git_commit.committer.name
    )

    return LynceusDict(
        {
            "id": commit.sha,
            "short_id": commit.sha[:8],
            "parent_ids": [parent_commit.sha for parent_commit in commit.parents],
            "message": raw_git_commit.get("message"),
            "created_at": git_commit.author.date,
            "author_name": author_name,
            "author_email": raw_git_commit["author"]["email"],
            "committer_name": committer_name,
            "committer_email": raw_git_commit["committer"]["email"],
        }
    )

236 

def _extract_branch_info(self, branch) -> LynceusDict:
    """
    Map a GitHub branch object to the uniform branch dictionary.

    The branch is flagged as "merged" when its head commit has more than one
    parent; "created_at" is the author date of that head commit.
    """
    head_commit = branch.commit
    head_sha = head_commit.sha
    branch_info = {
        "name": branch.name,
        "merged": len(head_commit.parents) > 1,
        "commit_id": head_sha,
        "commit_short_id": head_sha[:8],
        "created_at": head_commit.commit.author.date,
    }
    return LynceusDict(branch_info)

247 

def _extract_tag_info(self, tag) -> LynceusDict:
    """
    Map a GitHub tag object to the uniform tag dictionary.

    "created_at" is the author date of the tagged commit.
    """
    tagged_commit = tag.commit
    tagged_sha = tagged_commit.sha
    tag_info = {
        "name": tag.name,
        "commit_id": tagged_sha,
        "commit_short_id": tagged_sha[:8],
        "created_at": tagged_commit.commit.author.date,
    }
    return LynceusDict(tag_info)

257 

258 # The following methods are only performing read access on DevOps backend. 

@github_exception_handler
def authenticate(self):
    """
    Authenticate with the GitHub instance using the configured credentials.

    GitHub doesn't have a direct authentication endpoint, so this method
    validates the credentials by retrieving the current user information.

    Raises
    ------
    PermissionError
        If GitHub rejects the credentials (401/403)
    """
    # PyGithub builds the current user object lazily: no request is sent until
    # an attribute is read. Reading the login forces the API call, so invalid
    # credentials are reported here rather than on a later, unrelated call.
    _ = self._do_get_current_user().login

271 

@github_exception_handler
def _do_get_current_user(self):
    """
    Return the user object of the authenticated account.
    """
    current_user = self.__manager.get_user()
    return current_user

275 

# pylint: disable=unused-argument
@github_exception_handler
def _do_get_user_without_cache(
    self, *, username: str = None, email: str = None, **kwargs
):
    """
    Retrieve a user, by identifier when "user_id" is provided, else by login.

    Parameters
    ----------
    username : str, optional
        Login of the user, used when no "user_id" is given
    email : str, optional
        Not used by this implementation
    **kwargs
        Only "user_id" is taken into account
    """
    id_lookup = filter_kwargs(args_filter=["user_id"], **kwargs)
    if not id_lookup:
        return self.__manager.get_user(login=username)

    return self.__manager.get_user_by_id(**id_lookup)

286 

@github_exception_handler
def _do_get_groups(self, *, count: int | None = None, **kwargs):
    """
    List organizations, optionally limited to ``count`` entries.

    ``kwargs`` are forwarded as-is to the PyGithub organizations listing.
    """
    organizations = self.__manager.get_organizations(**kwargs)
    return get_list_from_paginated_and_count(organizations, count)

292 

@github_exception_handler
def _do_get_group_without_cache(self, *, full_path: str, **kwargs):
    """
    Retrieve the organization named ``full_path``.

    Raises
    ------
    NameError
        If the organization does not exist
    """
    # https://pygithub.readthedocs.io/en/latest/github_objects/Repository.html?highlight=organization#github.Repository.Repository.organization
    # https://pygithub.readthedocs.io/en/latest/github_objects/Team.html?highlight=organization#github.Team.Team.organization
    # https://pygithub.readthedocs.io/en/latest/examples/MainClass.html?highlight=organization#get-organization-by-name

    # There are no optional kwargs available in this implementation.
    try:
        organization = self.__manager.get_organization(full_path)
    except UnknownObjectException as exception:
        raise NameError(f'Group "{full_path}" has not been found.') from exception

    return organization

304 

@github_exception_handler
def _do_get_projects(self, *, count: int | None = None, **kwargs):
    """
    List repositories of the authenticated user, optionally limited to ``count``.

    ``kwargs`` are forwarded as-is to the PyGithub repositories listing.
    """
    current_user = self._do_get_current_user()
    repositories = current_user.get_repos(**kwargs)
    return get_list_from_paginated_and_count(repositories, count)

310 

@github_exception_handler
def _do_get_project_without_cache(self, *, full_path: str, **kwargs):
    """
    Retrieve the repository identified by ``full_path``.

    Raises
    ------
    NameError
        If the repository does not exist
    """
    # There are no optional kwargs available in this implementation.
    try:
        repository = self.__manager.get_repo(full_path)
    except UnknownObjectException as exception:
        raise NameError(
            f'Project/repository "{full_path}" has not been found.'
        ) from exception

    return repository

320 

@github_exception_handler
def check_permissions_on_project(
    self,
    *,
    full_path: str,
    get_metadata: bool,
    pull: bool,
    push: bool = False,
    maintain: bool = False,
    admin: bool = False,
    **kwargs,
):
    """
    Check permissions of the authenticated user on a GitHub repository.

    The repository is retrieved first; being able to retrieve it is considered
    as holding the metadata permission. Each requested level is then checked
    against the permissions object of the repository.

    Parameters
    ----------
    full_path : str
        Full path to the GitHub repository (e.g., 'owner/repo')
    get_metadata : bool
        Whether metadata access is required
    pull : bool
        Whether pull/read access is required
    push : bool, optional
        Whether push/write access is required (default: False)
    maintain : bool, optional
        Whether maintainer access is required (default: False)
    admin : bool, optional
        Whether admin access is required (default: False)
    **kwargs
        Additional repository lookup arguments

    Returns
    -------
    bool
        True when every requested permission is granted. When the repository
        cannot be retrieved (PermissionError or NameError), True only if no
        permission at all was requested.
    """
    try:
        # Retrieves repository metadata (can lead to NameError if authenticated user has not enough permissions).
        repository = self._do_get_project(full_path=full_path, **kwargs)

        # From here, we consider get_metadata permission is OK.
        granted: Permissions = repository.permissions
        requested_levels = (
            (pull, "pull"),
            (push, "push"),
            (maintain, "maintain"),
            (admin, "admin"),
        )

        # Only requested levels are read, in order, stopping at the first missing one.
        return all(
            getattr(granted, level)
            for requested, level in requested_levels
            if requested
        )
    except (PermissionError, NameError):
        # Returns True if there were NO permission at all to check ...
        return not (get_metadata or pull or push or maintain or admin)

397 

@github_exception_handler
def _do_get_project_commits(
    self, *, full_path: str, git_ref_name: str, count: int | None = None, **kwargs
):
    """
    List commits of a repository, optionally limited to ``count`` entries.

    When ``git_ref_name`` is falsy, no ``sha`` filter is sent to GitHub.
    """
    repository = self._do_get_project(full_path=full_path, **kwargs)
    starting_ref = git_ref_name or GithubObject.NotSet
    commits = repository.get_commits(sha=starting_ref)
    return get_list_from_paginated_and_count(commits, count)

406 

@github_exception_handler
def _do_get_project_branches(
    self, *, full_path: str, count: int | None = None, **kwargs
):
    """
    List branches of a repository, optionally limited to ``count`` entries.
    """
    repository = self._do_get_project(full_path=full_path, **kwargs)
    branches = repository.get_branches()
    return get_list_from_paginated_and_count(branches, count)

413 

@github_exception_handler
def _do_get_project_tags(
    self, *, full_path: str, count: int | None = None, **kwargs
):
    """
    List tags of a repository, optionally limited to ``count`` entries.
    """
    repository = self._do_get_project(full_path=full_path, **kwargs)
    tags = repository.get_tags()
    return get_list_from_paginated_and_count(tags, count)

420 

@github_exception_handler
def _do_get_project_members(
    self, *, full_path: str, count: int | None = None, **kwargs
):
    """
    List contributors of a repository, optionally limited to ``count`` entries.

    Only the "anon" entry of ``kwargs`` is forwarded to the contributors listing.
    """
    repository = self._do_get_project(full_path=full_path, **kwargs)
    listing_options = filter_kwargs(args_filter=["anon"], **kwargs)
    contributors = repository.get_contributors(**listing_options)
    return get_list_from_paginated_and_count(contributors, count)

432 

@github_exception_handler
def _do_get_group_members(
    self, *, full_path: str, count: int | None = None, **kwargs
):
    """
    List members of an organization, optionally limited to ``count`` entries.

    Only the "role" and "filter_" entries of ``kwargs`` are forwarded to the
    members listing.
    """
    # https://pygithub.readthedocs.io/en/latest/github_objects/Organization.html?highlight=member#github.Organization.Organization.get_members
    organization = self._do_get_group(full_path=full_path, **kwargs)
    listing_options = filter_kwargs(args_filter=["role", "filter_"], **kwargs)
    members = organization.get_members(**listing_options)
    return get_list_from_paginated_and_count(members, count)

445 

@github_exception_handler
def _do_get_project_issue_events(
    self,
    *,
    full_path: str,
    action: str | None = None,
    from_date: datetime | None = None,
    to_date: datetime | None = None,
    count: int | None = None,
    **kwargs,
):
    """
    Iterate over issue events of a repository, filtered on client side.

    The events are fetched eagerly, so that GitHub errors are raised (and
    translated by the exception handler) by this call and not later while the
    caller iterates over the result.

    Parameters
    ----------
    full_path : str
        Full path of the repository
    action : str, optional
        Only events of this type are kept
    from_date : datetime, optional
        Iteration stops at the first kept event created before this date
    to_date : datetime, optional
        Events created after this date are ignored
    count : int, optional
        Maximum number of events fetched, applied BEFORE filtering
    **kwargs
        Additional repository lookup arguments

    Returns
    -------
    generator
        Generator over the matching issue events
    """
    # https://docs.github.com/en/developers/webhooks-and-events/events/github-event-types
    # https://docs.github.com/en/rest/activity/events

    repository = self._do_get_project(full_path=full_path, **kwargs)
    # Important: unfortunately it is NOT possible to filter on server side ...
    issue_events = get_list_from_paginated_and_count(
        repository.get_issues_events(), count
    )

    def _matching_issue_events():
        # Filters the already fetched issue events according to specified parameter(s).
        for issue_event in issue_events:
            if action and issue_event.event != action:
                continue

            if from_date and issue_event.created_at < from_date:
                # Stops the iteration since this issue event is the first being too old.
                break

            if to_date and issue_event.created_at > to_date:
                # Ignores this issue event.
                continue

            yield issue_event

    return _matching_issue_events()

480 

@github_exception_handler
def _do_get_project_issues(
    self, *, full_path: str, count: int | None = None, **kwargs
):
    """
    List issues of a repository, optionally limited to ``count`` entries.

    Unless a "state" is given in ``kwargs``, issues in all states are requested.
    ``kwargs`` are forwarded to the issues listing.
    """
    repository = self._do_get_project(full_path=full_path, **kwargs)

    # Defaults to every state when the caller did not choose one.
    kwargs.setdefault("state", "all")

    issues = repository.get_issues(**kwargs)
    return get_list_from_paginated_and_count(issues, count)

491 

@github_exception_handler
def _do_get_project_merge_requests(
    self, *, full_path: str, count: int | None = None, **kwargs
):
    """
    List pull requests of a repository, optionally limited to ``count`` entries.

    Unless a "state" is given in ``kwargs``, pull requests in all states are
    requested. ``kwargs`` are forwarded to the pull requests listing.
    """
    repository = self._do_get_project(full_path=full_path, **kwargs)

    # Defaults to every state when the caller did not choose one.
    kwargs.setdefault("state", "all")

    pull_requests = repository.get_pulls(**kwargs)
    return get_list_from_paginated_and_count(pull_requests, count)

502 

@github_exception_handler
def _do_get_project_milestones(
    self, *, full_path: str, count: int | None = None, **kwargs
):
    """
    List milestone titles of a repository, optionally limited to ``count`` entries.

    ``kwargs`` are forwarded to the milestones listing.
    """
    repository = self._do_get_project(full_path=full_path, **kwargs)
    milestones = get_list_from_paginated_and_count(
        repository.get_milestones(**kwargs), count
    )

    # Only the titles are exposed.
    return [entry.title for entry in milestones]

512 

@github_exception_handler
def _do_get_group_milestones(
    self, *, full_path: str, count: int | None = None, **kwargs
):
    """
    Return an empty list, after logging a warning.

    Milestones are not available on organizations; the parameters are accepted
    but not used.
    """
    no_milestones = []
    self._logger.warning(
        "There is no milestone on Organization under Github (you need to check milestones directly from a project/repository)."
    )
    return no_milestones

521 

@github_exception_handler
def _do_get_group_projects(
    self, *, full_path: str, count: int | None = None, **kwargs
):
    """
    List repositories of an organization, optionally limited to ``count`` entries.

    Only the "type", "sort" and "direction" entries of ``kwargs`` are forwarded
    to the repositories listing.
    """
    organization = self._do_get_group(full_path=full_path, **kwargs)
    listing_options = filter_kwargs(
        args_filter=["type", "sort", "direction"], **kwargs
    )
    repositories = organization.get_repos(**listing_options)
    return get_list_from_paginated_and_count(repositories, count)

533 

@github_exception_handler
def get_user_stats_commit_activity(
    self,
    *,
    group_full_path: str = None,
    project_full_path: str = None,
    since: datetime = None,
    keep_empty_stats: bool = False,
    count: int | None = None,
):
    """
    Get daily commit counts over the repositories of the authenticated user.

    The counts come from the per-repository commit activity statistics; they
    are NOT filtered by author.

    Parameters
    ----------
    group_full_path : str, optional
        Organization path to limit statistics to (unused)
    project_full_path : str, optional
        Repository path to include in statistics (unused)
    since : datetime, optional
        Weeks starting before this date are ignored (defaults to 365 days ago)
    keep_empty_stats : bool, optional
        Whether to include days with zero commits (default: False)
    count : int, optional
        Maximum number of repositories to analyze

    Returns
    -------
    dict
        Mapping of dates to commit counts
    """
    # Cf. https://developer.github.com/v3/repos/statistics/#get-the-last-year-of-commit-activity-data

    # Defines threshold date.
    threshold: datetime = since or datetime.now(tz=timezone.utc) - timedelta(days=365)

    commits_per_day: dict[date, int] = defaultdict(int)
    for repo in self._do_get_projects(count=count):
        for weekly_activity in repo.get_stats_commit_activity() or []:
            # TODO: define how to ensure only commits of the expected author are counted.
            week_start = weekly_activity.week

            # Ignores oldest statistics.
            if week_start < threshold:
                continue

            # Ignores empty weeks, unless they are wanted.
            if not (keep_empty_stats or weekly_activity.total):
                continue

            for day_offset, commit_count in enumerate(weekly_activity.days):
                # Ignores empty days, unless they are wanted.
                if commit_count or keep_empty_stats:
                    day = (week_start + timedelta(days=day_offset)).date()
                    commits_per_day[day] += commit_count

    return commits_per_day

597 

@github_exception_handler
def get_user_contributions(
    self,
    *,
    since: datetime = None,
    keep_empty_stats: bool = False,
    count: int | None = None,
):
    """
    Get weekly contribution statistics (additions, deletions, commits) of the authenticated user.

    Parameters
    ----------
    since : datetime, optional
        Weeks starting before this date are ignored (defaults to 365 days ago)
    keep_empty_stats : bool, optional
        Whether to include periods with zero contributions (default: False)
    count : int, optional
        Maximum number of repositories to analyze

    Returns
    -------
    dict
        Mapping of week start dates to contribution statistics (additions, deletions, commits)
    """
    # Cf. https://developer.github.com/v3/repos/statistics/

    # Defines threshold date.
    contributions_since: datetime = (
        since if since else datetime.now(tz=timezone.utc) - timedelta(days=365)
    )

    # The contributor statistics expose the author as a user object, not as a
    # name: comparing it to the user's display name never matches. Logins are
    # compared instead.
    current_login = self._do_get_current_user().login

    stats_user_contributions: dict[date, dict[str, int]] = {}
    for repo in self._do_get_projects(count=count):
        for contribution in repo.get_stats_contributors() or []:
            author = contribution.author
            if author is None or author.login != current_login:
                continue

            # For each weekly stats.
            for stats in contribution.weeks:
                # Ignores oldest statistics.
                if stats.w < contributions_since:
                    continue

                # Ignores empty stats, unless they are wanted.
                if not keep_empty_stats and not (stats.a or stats.d or stats.c):
                    continue

                stats_map = stats_user_contributions.setdefault(
                    stats.w.date(), {"additions": 0, "deletions": 0, "commits": 0}
                )
                stats_map["additions"] += stats.a
                stats_map["deletions"] += stats.d
                stats_map["commits"] += stats.c

    return stats_user_contributions

662 

@github_exception_handler
def get_user_stats_code_frequency(self, *, count: int | None = None):
    """
    Get additions and deletions per week, summed over the repositories of the authenticated user.

    Parameters
    ----------
    count : int, optional
        Maximum number of repositories to analyze

    Returns
    -------
    dict
        Mapping of weeks to code frequency data (additions, deletions)
    """
    frequency_per_week = {}
    for repo in self._do_get_projects(count=count):
        for weekly_stats in repo.get_stats_code_frequency() or []:
            # Creates the weekly entry on first occurrence, then accumulates.
            week_totals = frequency_per_week.setdefault(
                weekly_stats.week, {"additions": 0, "deletions": 0}
            )
            week_totals["additions"] += weekly_stats.additions
            week_totals["deletions"] += weekly_stats.deletions

    return frequency_per_week

690 

@github_exception_handler
def _do_download_repository(
    self,
    *,
    project_full_path: str,
    dest_path: Path,
    reference: str = None,
    chunk_size: int = 1024,
    **kwargs,
):
    """
    Download a repository as a tarball archive into ``dest_path``.

    Parameters
    ----------
    project_full_path : str
        Full path of the repository
    dest_path : Path
        File in which the archive is written
    reference : str, optional
        Git reference to download; when falsy, no reference is sent to GitHub
    chunk_size : int, optional
        Size of the chunks read from the HTTP response (default: 1024)
    **kwargs
        Additional repository lookup arguments

    Raises
    ------
    NameError
        If the archive URL answers with a 404 status
    requests.HTTPError
        If the archive URL answers with any other error status
    """
    # Cf. https://developer.github.com/v3/repos/contents/#get-archive-link
    # Cf. https://github.com/PyGithub/PyGithub/blob/main/github/Repository.py#L1535
    repository = self._do_get_project(full_path=project_full_path, **kwargs)
    url: str = repository.get_archive_link(
        archive_format="tarball", ref=reference or GithubObject.NotSet
    )
    with requests.get(url, stream=True, timeout=60) as reader:
        # Ensures specified reference exists.
        if reader.status_code == 404:
            raise NameError(
                f'Reference "{reference}" does not exists for project/repository "{project_full_path}".'
            )

        # Any other HTTP error must not be saved as if it were the archive:
        # fails before the destination file is created.
        reader.raise_for_status()

        with open(dest_path, "wb") as dest_file:
            for chunk in reader.iter_content(chunk_size=chunk_size):
                dest_file.write(chunk)