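"""GitHub repository crawler for scientific-computing projects.

Resumable pipeline driven by files under --workdir:
  Step 0: expand the base keyword list with an LLM (keywords_expanded.json).
  Step 1: search GitHub per keyword, splitting date ranges to stay under the
          Search API's 1,000-result cap (repos_searched.csv).
Helpers for LLM relevance checking (check_relevance) and shallow cloning
(clone_repos) support the later pipeline stages.

Requires GITHUB_TOKEN and an OpenAI-compatible API key (OPENAI_API_KEY),
loaded from the environment or a .env file.
"""
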
import os

# Run relative to the script's own directory so .env and workdir paths resolve consistently.
os.chdir(os.path.dirname(os.path.abspath(__file__)))

import argparse
import asyncio
import json
import shutil
import subprocess
from datetime import datetime, timedelta
from pathlib import Path
from typing import Any, Dict, List, Optional

import pandas as pd
import requests
from dotenv import load_dotenv
from langchain_core.output_parsers import JsonOutputParser
from pydantic import BaseModel

from util import init_logger, logger, call_llm, CODE_EXTENSIONS


class RelevanceResult(BaseModel):
    """LLM verdict on whether a repository matches the target keywords."""

    relevant: str  # expected to be "YES" or "NO"
    reason: str


class ExpandedKeywords(BaseModel):
    """LLM output schema for keyword expansion."""

    keywords: List[str]


def search_github_repos(keywords: List[str], token: str, output_csv: Path):
    """Search GitHub repos with incremental writes per keyword, resuming from an existing CSV."""
    headers = {"Authorization": f"Bearer {token}", "Accept": "application/vnd.github.v3+json"}

    # Resume support: skip keywords already present in the output CSV and remember
    # URLs already written so repos are not duplicated across keywords.
    all_keywords = set(k.lower() for k in keywords)
    searched_keywords = set()
    global_seen = set()
    if output_csv.exists():
        df_existing = pd.read_csv(output_csv)
        searched_keywords = set(df_existing["keyword"].str.lower().unique())
        global_seen = set(df_existing["url"].tolist())
        logger.info(f"Resume: Already searched {len(searched_keywords)} keywords")

    pending_keywords = all_keywords - searched_keywords
    pending = [k for k in keywords if k.lower() in pending_keywords]
    logger.info(f"Pending: {len(pending)} keywords to search")

    def get_count(query: str) -> int:
        """Get the total result count without fetching data."""
        try:
            resp = requests.get(
                "https://api.github.com/search/repositories",
                headers=headers,
                params={"q": query, "per_page": 1},
            )
            if resp.status_code == 200:
                return resp.json().get("total_count", 0)
        except Exception:
            pass
        # Errors and rate-limited responses fall through to 0, which skips the window.
        return 0

    def fetch_repos(query: str, keyword: str, local_repos: List[Dict]):
        """Fetch all results for a single query."""
        page = 1
        while page <= 10:  # 10 pages x 100 per page = the Search API's 1000-result cap
            try:
                resp = requests.get(
                    "https://api.github.com/search/repositories",
                    headers=headers,
                    params={"q": query, "per_page": 100, "page": page},
                )
                if resp.status_code != 200:
                    break
                items = resp.json().get("items", [])
                if not items:
                    break

                for r in items:
                    url = r.get("html_url", "")
                    if url and url not in global_seen:
                        global_seen.add(url)
                        repo_data = {
                            "keyword": keyword,
                            "name": r.get("name", ""),
                            "full_name": r.get("full_name", ""),
                            "owner": r.get("owner", {}).get("login", ""),
                            "url": url,
                            "description": r.get("description") or "",
                            "language": r.get("language") or "",
                            "topics": ",".join(r.get("topics", [])),
                            "stars": r.get("stargazers_count", 0),
                            "forks": r.get("forks_count", 0),
                            "created_at": r.get("created_at", ""),
                            "updated_at": r.get("updated_at", ""),
                            "pushed_at": r.get("pushed_at", ""),
                            "license": r.get("license", {}).get("spdx_id", "") if r.get("license") else "",
                            "default_branch": r.get("default_branch", ""),
                            "open_issues": r.get("open_issues_count", 0),
                            "size": r.get("size", 0),
                            "has_wiki": r.get("has_wiki", False),
                            "archived": r.get("archived", False),
                        }
                        local_repos.append(repo_data)

                if len(items) < 100:
                    break
                page += 1
            except Exception as e:
                logger.error(f"Fetch error: {e}")
                break

    def split_by_date(kw: str, keyword: str, start_date: datetime, end_date: datetime, local_repos: List[Dict]):
        """Recursively split the created-date range (with in:readme and stars:>10 filters) so each query stays under the 1000-result cap."""
        start_str = start_date.strftime("%Y-%m-%d")
        end_str = end_date.strftime("%Y-%m-%d")
        query = f"{kw} in:readme stars:>10 created:{start_str}..{end_str}"
        count = get_count(query)
        logger.info(f"  {start_str} to {end_str}: {count} repos")

        if count == 0:
            return
        elif count <= 1000:
            fetch_repos(query, keyword, local_repos)
        else:
            days = (end_date - start_date).days
            if days == 0:
                logger.warning(f"Single day has {count} repos, getting first 1000: {start_str}")
                fetch_repos(query, keyword, local_repos)
            else:
                mid_days = days // 2
                mid_date = start_date + timedelta(days=mid_days)
                split_by_date(kw, keyword, start_date, mid_date, local_repos)
                split_by_date(kw, keyword, mid_date + timedelta(days=1), end_date, local_repos)

    for kw in pending:
        logger.info(f"Searching keyword: {kw}")
        keyword_repos = []
        start = datetime(2008, 1, 1)  # GitHub launched in 2008, so no repos predate this.
        end = datetime.now()
        split_by_date(kw, kw, start, end, keyword_repos)

        # Append this keyword's results immediately so an interrupted run can resume.
        if keyword_repos:
            df_new = pd.DataFrame(keyword_repos)
            df_new.to_csv(output_csv, mode="a", header=not output_csv.exists(), index=False, encoding="utf-8")
            logger.info(f"✓ Saved {len(keyword_repos)} repos for keyword: {kw}")
        else:
            logger.info(f"✓ No new repos for keyword: {kw}")

    logger.info(f"Total repos in CSV: {len(global_seen)}")


def get_readme(owner: str, repo: str, token: str) -> str:
    """Fetch the raw README content for a repo, or an empty string on any failure."""
    try:
        resp = requests.get(
            f"https://api.github.com/repos/{owner}/{repo}/readme",
            headers={"Authorization": f"Bearer {token}", "Accept": "application/vnd.github.v3.raw"},
        )
        return resp.text if resp.status_code == 200 else ""
    except Exception:
        return ""


async def check_relevance(
    repo: Dict, keywords: List[str], model: str, base_url: str, api_key: str, token: str, log_file: str
) -> bool:
    """Use the LLM to decide whether a repo is relevant to any of the keywords."""
    readme = get_readme(repo["owner"], repo["name"], token)[:8000]  # truncate to keep the prompt small

    prompt = f"""Determine if this GitHub repository is relevant to the keywords: {', '.join(keywords)}

Repository: {repo['name']}
Description: {repo['description']}
Language: {repo['language']}
README (truncated):
{readme}

Answer 'YES' if the repository is related to any of the keywords, 'NO' otherwise.
Provide your reasoning in the reason field."""

    try:
        result = await call_llm(
            [{"role": "user", "content": prompt}],
            model,
            base_url,
            api_key,
            pydantic_object=RelevanceResult,
            log_file=log_file,
            temperature=0.1,
        )
        return result.get("relevant", "").upper() == "YES"
    except Exception as e:
        logger.error(f"LLM error for {repo['name']}: {e}")
        return False


def save_csv(repos: List[Dict], path: str):
    """Save repos to a CSV file using pandas."""
    df = pd.DataFrame(repos)
    df.to_csv(path, index=False, encoding="utf-8")
    logger.info(f"Saved {len(repos)} repos to {path}")


def clone_repos(csv_path: str, dest_dir: Path):
    """Shallow-clone repos listed in a CSV, resuming by checking for existing owner___repo directories."""
    dest_dir.mkdir(parents=True, exist_ok=True)

    df = pd.read_csv(csv_path)
    to_clone = df.to_dict("records")
    total_repos = len(to_clone)

    existing_dirs = set(d.name for d in dest_dir.iterdir() if d.is_dir() and not d.name.startswith("."))

    def is_cloned(full_name: str) -> bool:
        owner, repo = full_name.split("/")
        patterns = [f"{owner}___{repo}"]
        return any(p in existing_dirs for p in patterns)

    pending = [r for r in to_clone if not is_cloned(r["full_name"])]
    already_count = total_repos - len(pending)
    logger.info(f"Resume: Already cloned {already_count}/{total_repos} repos")
    logger.info(f"Pending: {len(pending)} repos to clone")

    for i, row in enumerate(pending):
        full_name = row["full_name"]
        repo_path = dest_dir / full_name.replace("/", "___")
        try:
            subprocess.run(
                ["git", "clone", "--depth", "1", row["url"], str(repo_path)], check=True, capture_output=True
            )
            logger.info(f"[{already_count + i + 1}/{total_repos}] Cloned: {full_name}")
        except Exception as e:
            logger.error(f"[{already_count + i + 1}/{total_repos}] Clone failed {full_name}: {e}")


async def main():
    load_dotenv()
    parser = argparse.ArgumentParser(description="GitHub Repo Crawler")
    parser.add_argument(
        "--keywords",
        type=str,
default="Chemistry, Biology, Biochemistry, Omics, Medicine, Pharmacology, Toxicology, Bioinformatics, Bioengineering, Biophysics, Viral, Microbial, Prediction, Discovery, Protein, Gene, DNA, RNA, Vaccine, Computational Biology, Computational Biochemistry, Computational Chemistry, Computational Materials, Quantum Chemistry, Disease, Biomedical, Material, Pharmacogenetics, Pharmacogenomics, Modeling, Networks, In Silico, Pathology, Physiology, Genomics, Proteomics, Transcriptomics, Metabolomics, Glycomics, Lipidomics, Immunology, Microbiology, Molecular biology, Pharmaceutics, Network pharmacology, Epigenetics, Sequencing, Design, Multi-omics, Biomarker, System biology, Synthetic biology, Cell biology, Cancer biology, Ensemble, Personalized, Lipid, Metabolic, Genesis, Ion, Heterogeneity, Generative, Generate, Human, Receptor, Ligand, Organoid, Evolution, Pathogens, Homeostasis, Allele, Genotype, Phenotype, Antibody, Antigen, Nucleic acids, Carbohydrate, Substrate, Inhibition, Activation, Allosteric, Cofactor, Coenzyme, Enzyme, Redox, Hydrophilic, Hydrophobic, Codon, Transcription, Translation, Pathway, Cycle, Signaling, Dynamics, Kinetics, Docking, Spectrometry, Profiling, Diagnostics, CRISPR, Bio, Marker, Pharmacokinetics, Pharmacodynamics, Absorption, Mechanism of action, Agonist, Antagonist, Bioavailability, Half-life, Reaction, Drug, Biologics, Pharmacometrics, Beta-blocker, Regulatory networks, Multi-scale modeling, Single-cell, Spatial biology, Integration, Monte Carlo, System immunology, Metagenomics, QSAR, QAPR, Chemical space, AlphaFold, Folding, Mechanism, Digital twin, Virtual human, Gene editing, Bio foundation model, Biotechnology, Assay, Lead discovery, High-throughput, Screening, Hit-to-lead, Lead optimization, De novo, ADMET, Translational medicine, Drug repurpose, Conjugate, Agent-based model, Compartmental model, Reproduction number, Nowcasting, Phylodynamic model, Physiologically based pharmacokinetics model, PBPK model, Organ-on-a-chip, Anomaly detection, Stochastic modeling, Genomic surveillance, Antimicrobial resistance modeling, AMR, Pandemic, Digital PCR, Next-generation sequencing, Biosensors, Imaging, Sensors, Quantum mechanics, DFT, Ab initio, Hartree-Fock, Coupled cluster, Electronic structure, Homo-Lumo, Conformation, Cheminformatics, QM/MM, First-principles based DFT, Diffusion, Finite element method, Phase-field technique, Potential, Metamaterial, 2D, 3D, Porous, Crystal, Rosettafold, Gene regulatory networks, Cell atlas, Human atlas, Spatial transcriptomics, Pseudotime analysis, Quantum biology, Metabolic flux analysis, Free energy perturbation, Protein-protein, Explainable AI, Neurology, Reinforcement learning, Generative AI, Flow matching, Generative adversarial networks, GAN, Variational autoencoders, VAE, Autoregressive, Transformer, Recurrent neural networks, RNN, Score", |
        help="Comma-separated keywords",
    )
    parser.add_argument("--workdir", type=str, default="./workdir", help="Working directory")
    parser.add_argument("--model", type=str, default=os.getenv("OPENAI_MODEL", "gpt-4o"))
    parser.add_argument("--base_url", type=str, default=os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1"))
    parser.add_argument("--api_key", type=str, default=os.getenv("OPENAI_API_KEY"))
    args = parser.parse_args()

    workdir = Path(args.workdir)
    config = {
        "workdir": workdir,
        "keywords_expanded": workdir / "keywords_expanded.json",
        "repos_searched": workdir / "repos_searched.csv",
        "repos_checked": workdir / "repos_checked.csv",
        "repos_raw": workdir / "repos_raw",
        "repos_filtered": workdir / "repos_filtered",
        "log_file": str(workdir / "calls_llm.jsonl"),
        "model": args.model,
        "base_url": args.base_url,
        "api_key": args.api_key,
        "github_token": os.environ.get("GITHUB_TOKEN"),
        "keywords": [k.strip() for k in args.keywords.split(",") if k.strip()],
    }

    os.makedirs(config["workdir"], exist_ok=True)
    init_logger(str(config["workdir"] / "run.log"))
    logger.info(f"Base keywords: {config['keywords']}")
    logger.info(f"Model: {config['model']}")
    # Step 0: expand the base keyword list with the LLM (skipped if the output already exists).
    if config["keywords_expanded"].exists():
        logger.info(f"[Skip] Step 0: {config['keywords_expanded']} exists")
        with open(config["keywords_expanded"], "r") as f:
            expanded = json.load(f)["keywords"]
    else:
        logger.info("=" * 60 + "\nStep 0: Expand Keywords with LLM\n" + "=" * 60)
        output_parser = JsonOutputParser(pydantic_object=ExpandedKeywords)
        messages = [
            {
                "role": "system",
                "content": f"You are an assistant that generates diverse and related keywords for scientific disciplines.\n{output_parser.get_format_instructions()}",
            },
            {
                "role": "user",
                "content": f"""Generate a list of exactly 5 diverse keywords related to these scientific fields: {', '.join(config['keywords'])}.
Make sure that the generated keywords do not stray away from these scientific disciplines and do not contain broad terms that will confuse the search (e.g. machine learning, algorithms, etc).
I would like to use these keywords to retrieve code repositories related to these specific scientific disciplines from GitHub and Papers with Code.""",
            },
        ]

        try:
            result = await call_llm(
                messages,
                config["model"],
                config["base_url"],
                config["api_key"],
                pydantic_object=ExpandedKeywords,
                log_file=config["log_file"],
                temperature=0.5,
            )
            expanded = list(set(config["keywords"] + result.get("keywords", [])))
        except Exception as e:
            logger.error(f"Keyword expansion failed: {e}, using base keywords")
            expanded = config["keywords"]

        with open(config["keywords_expanded"], "w") as f:
            json.dump({"keywords": expanded}, f, indent=2)
        logger.info(f"[Done] Step 0: {len(expanded)} keywords: {expanded}")

    # Step 1: search GitHub for every expanded keyword (resumes from repos_searched.csv).
    logger.info("=" * 60 + "\nStep 1: Search GitHub Repos\n" + "=" * 60)
    search_github_repos(expanded, config["github_token"], config["repos_searched"])

    if config["repos_searched"].exists():
        df_final = pd.read_csv(config["repos_searched"])
        logger.info(f"[Done] Step 1: {len(df_final)} total repos in CSV")
    else:
        logger.warning("No repos found")
        pd.DataFrame(
            columns=[
                "keyword",
                "name",
                "full_name",
                "owner",
                "url",
                "description",
                "language",
                "topics",
                "stars",
                "forks",
                "created_at",
                "updated_at",
                "pushed_at",
                "license",
                "default_branch",
                "open_issues",
                "size",
                "has_wiki",
                "archived",
            ]
        ).to_csv(config["repos_searched"], index=False)
        logger.info("[Done] Step 1: 0 repos saved")
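
    # ------------------------------------------------------------------
    # The remaining pipeline steps are not present in this file. The sketch
    # below is a hypothetical reconstruction inferred from the unused config
    # entries (repos_checked, repos_raw, repos_filtered) and the helpers
    # defined above (check_relevance, save_csv, clone_repos, CODE_EXTENSIONS);
    # it assumes CODE_EXTENSIONS is a collection of file suffixes such as ".py".
    # Adjust or replace it with the actual Steps 2-4 of the pipeline.
    # ------------------------------------------------------------------

    # Step 2 (sketch): LLM relevance filtering of the searched repos.
    if not config["repos_checked"].exists():
        logger.info("=" * 60 + "\nStep 2: Check Relevance with LLM\n" + "=" * 60)
        df_searched = pd.read_csv(config["repos_searched"])
        relevant_rows = []
        for repo in df_searched.to_dict("records"):
            if await check_relevance(
                repo,
                expanded,
                config["model"],
                config["base_url"],
                config["api_key"],
                config["github_token"],
                config["log_file"],
            ):
                relevant_rows.append(repo)
        save_csv(relevant_rows, str(config["repos_checked"]))
    logger.info(f"[Done] Step 2: relevant repos listed in {config['repos_checked']}")

    # Step 3 (sketch): shallow-clone the relevant repos into repos_raw.
    logger.info("=" * 60 + "\nStep 3: Clone Repos\n" + "=" * 60)
    clone_repos(str(config["repos_checked"]), config["repos_raw"])
    logger.info("[Done] Step 3")

    # Step 4 (sketch): copy only recognised code files into repos_filtered.
    logger.info("=" * 60 + "\nStep 4: Filter Code Files\n" + "=" * 60)
    config["repos_filtered"].mkdir(parents=True, exist_ok=True)
    for repo_dir in config["repos_raw"].iterdir():
        if not repo_dir.is_dir():
            continue
        for src in repo_dir.rglob("*"):
            if src.is_file() and src.suffix.lower() in CODE_EXTENSIONS:
                dst = config["repos_filtered"] / repo_dir.name / src.relative_to(repo_dir)
                dst.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy2(src, dst)
    logger.info("[Done] Step 4")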


if __name__ == "__main__":
    asyncio.run(main())