"""Extract protein-coding gene names from a gzip-compressed GTF annotation.

Reads the GTF named by ``gtf_file`` (looked up next to this script), keeps
the ``gene_name`` of every ``gene`` record whose line carries
``gene_type "protein_coding"`` or ``gene_biotype "protein_coding"``, and
writes the sorted, de-duplicated names to ``output_file`` (one per line,
also next to this script).
"""

import gzip
import os
import re

# ==== Parameters ====
gtf_file = "gencode.v48.annotation.gtf.gz"  # your GTF file
output_file = "human_protein_coding_genes.txt"

script_dir = os.path.dirname(os.path.realpath(__file__))

# A record counts as protein coding when its line contains either snippet;
# both attribute spellings (gene_type / gene_biotype) are accepted.
_PROTEIN_CODING_MARKERS = (
    'gene_type "protein_coding"',
    'gene_biotype "protein_coding"',
)

# Compiled once here instead of being looked up for every matching line.
_GENE_NAME_RE = re.compile(r'gene_name "([^"]+)"')


def extract_protein_coding_genes(lines):
    """Return the set of protein-coding gene names found in GTF *lines*.

    Args:
        lines: Any iterable of GTF text lines (an open text file works).

    Returns:
        A ``set`` of ``gene_name`` values taken from records whose third
        tab-separated field is exactly ``gene`` and whose line contains one
        of the protein-coding markers. Lines starting with ``#``, lines
        with fewer than three fields, and gene records without a
        ``gene_name`` attribute are skipped.
    """
    genes = set()
    for line in lines:
        if line.startswith("#"):
            continue
        # Only the third field is inspected, so stop splitting after it.
        parts = line.split("\t", 3)
        if len(parts) <= 2 or parts[2] != "gene":
            continue
        # Keep only records annotated as protein coding.
        if not any(marker in line for marker in _PROTEIN_CODING_MARKERS):
            continue
        m = _GENE_NAME_RE.search(line)
        if m:
            genes.add(m.group(1))
    return genes


def main():
    """Parse the configured GTF and write the sorted gene list to disk."""
    gtf_path = os.path.join(script_dir, gtf_file)
    output_path = os.path.join(script_dir, output_file)

    # ==== Parse the GTF ====
    # Explicit encoding keeps the result independent of the platform locale.
    with gzip.open(gtf_path, "rt", encoding="utf-8") as f:
        genes = extract_protein_coding_genes(f)

    # ==== Save to file ====
    genes_sorted = sorted(genes)
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(g + "\n" for g in genes_sorted)

    print(f"✅ 已提取 {len(genes_sorted)} 个蛋白编码基因 → {output_file}")


if __name__ == "__main__":
    main()