digital-embryo/embryo-backend/Data/GetGeneName.py
2025-07-27 17:57:58 +08:00

31 lines
994 B
Python

import gzip
import re
import os
# Extract the names of all protein-coding genes from a gzipped GTF annotation
# and write them, sorted and de-duplicated, one per line to a text file.
# Both the input and the output live in the same directory as this script.

# ==== Parameters ====
gtf_file = "gencode.v48.annotation.gtf.gz"  # GTF annotation, read from the script's directory
output_file = "human_protein_coding_genes.txt"
script_dir = os.path.dirname(os.path.realpath(__file__))
genes = set()  # a set, so a gene name appearing on several lines is kept once

# Compiled once up front rather than looked up again for every line.
gene_name_re = re.compile(r'gene_name "([^"]+)"')
# A line counts as protein-coding if it carries either of these attribute spellings.
protein_coding_tags = (
    'gene_type "protein_coding"',
    'gene_biotype "protein_coding"',
)

# ==== Parse the GTF ====
# The encoding is pinned so the result does not depend on the machine's locale.
with gzip.open(os.path.join(script_dir, gtf_file), "rt", encoding="utf-8") as f:
    for line in f:
        # Skip header/comment lines.
        if line.startswith("#"):
            continue
        parts = line.split("\t")
        # Only records whose third tab-separated column is exactly "gene".
        if len(parts) <= 2 or parts[2] != "gene":
            continue
        # Keep protein-coding genes only.
        if not any(tag in line for tag in protein_coding_tags):
            continue
        m = gene_name_re.search(line)
        if m:
            genes.add(m.group(1))

# ==== Save to file ====
genes_sorted = sorted(genes)
with open(os.path.join(script_dir, output_file), "w", encoding="utf-8") as f:
    f.writelines(g + "\n" for g in genes_sorted)
print(f"✅ 已提取 {len(genes_sorted)} 个蛋白编码基因 → {output_file}")