import gzip
import re
import os

# ==== Parameters ====
gtf_file = "gencode.v48.annotation.gtf.gz"   # gzip-compressed GTF annotation file
output_file = "human_protein_coding_genes.txt"
script_dir = os.path.dirname(os.path.realpath(__file__))
genes = set()

# Input and output both live next to this script, so the result does not
# depend on the current working directory.
gtf_path = os.path.join(script_dir, gtf_file)
output_path = os.path.join(script_dir, output_file)

# A record counts as protein-coding when its line carries either of these
# attribute spellings.
protein_coding_tags = (
    'gene_type "protein_coding"',
    'gene_biotype "protein_coding"',
)
# Compiled once, outside the loop, instead of being looked up per line.
gene_name_re = re.compile(r'gene_name "([^"]+)"')

# ==== Parse the GTF ====
# Explicit encoding so decoding does not depend on the machine's locale.
with gzip.open(gtf_path, "rt", encoding="utf-8") as f:
    for line in f:
        # Skip header / comment lines.
        if line.startswith("#"):
            continue
        # Only the third column (feature type) is inspected, so stop
        # splitting right after it rather than cutting up the whole line.
        parts = line.split("\t", 3)
        if len(parts) < 3 or parts[2] != "gene":
            continue
        # Keep only records tagged as protein_coding.
        if not any(tag in line for tag in protein_coding_tags):
            continue
        m = gene_name_re.search(line)
        # Records without a gene_name attribute are skipped.
        if m:
            genes.add(m.group(1))

# ==== Save to file ====
# Sorted so the output is deterministic; the set already removed duplicates.
genes_sorted = sorted(genes)
with open(output_path, "w", encoding="utf-8") as f:
    f.writelines(f"{g}\n" for g in genes_sorted)

print(f"✅ 已提取 {len(genes_sorted)} 个蛋白编码基因 → {output_file}")