31 lines
994 B
Python
31 lines
994 B
Python
import gzip
import os
import re
from typing import Iterable, Set

# ==== Parameters ====
gtf_file = "gencode.v48.annotation.gtf.gz"  # your GTF file (read from this script's directory)
output_file = "human_protein_coding_genes.txt"  # written to this script's directory
script_dir = os.path.dirname(os.path.realpath(__file__))

# Compiled once here rather than being looked up again for every GTF line.
_GENE_NAME_RE = re.compile(r'gene_name "([^"]+)"')

# A gene record is kept when its line contains either of these attribute
# spellings (presumably to cover both GENCODE- and Ensembl-style GTF files;
# confirm against the annotation sources actually used).
_PROTEIN_CODING_TAGS = (
    'gene_type "protein_coding"',
    'gene_biotype "protein_coding"',
)


def extract_protein_coding_genes(lines: Iterable[str]) -> Set[str]:
    """Collect the gene names of protein-coding ``gene`` records.

    Args:
        lines: GTF text lines, e.g. an open text-mode file handle. Lines
            starting with ``#`` are treated as header/comment lines.

    Returns:
        The set of ``gene_name`` values found on lines whose third
        tab-separated column is exactly ``gene`` and which carry a
        protein-coding ``gene_type``/``gene_biotype`` attribute. Records
        without a ``gene_name`` attribute are skipped.
    """
    names = set()
    for line in lines:
        if line.startswith("#"):
            continue
        parts = line.split("\t")
        # Column 3 is the feature type; only gene-level records are wanted.
        if len(parts) <= 2 or parts[2] != "gene":
            continue
        # Keep only records tagged as protein_coding.
        if not any(tag in line for tag in _PROTEIN_CODING_TAGS):
            continue
        match = _GENE_NAME_RE.search(line)
        if match:
            names.add(match.group(1))
    return names


def main() -> None:
    """Parse the gzipped GTF, write the sorted gene names, print a summary."""
    gtf_path = os.path.join(script_dir, gtf_file)
    out_path = os.path.join(script_dir, output_file)

    # ==== Parse the GTF ====
    with gzip.open(gtf_path, "rt") as f:
        genes_sorted = sorted(extract_protein_coding_genes(f))

    # ==== Save to file ====
    with open(out_path, "w") as f:
        f.writelines(g + "\n" for g in genes_sorted)

    print(f"✅ 已提取 {len(genes_sorted)} 个蛋白编码基因 → {output_file}")


# Guarded so that importing this module does not trigger the file I/O.
if __name__ == "__main__":
    main()