- 输入文件:fa格式的文件
- 输出结果:kmer的频数和对应的kmer类型和计数 1.将fq.gz的文件转换成fa文件:
#!/usr/bin/python env# -*- coding:utf-8 -*-import osimport reimport os.pathimport gzipimport sys#在这里可以写一个函数用来将文件转换成id和序列对应的字典#需要用到哪个转化操作呢?考虑先尝试使用filter或者map''' @r261DFRAAXX100204:1:100:10494:3070/1ACTGCATCCTGGAAAGAATCAATGGTGGCCGGAAAGTGTTTTTCAAATACAAGAGTGACAATGTGCCCTGTTGTTT+ACCCCCCCCCCCCCCCCCCCCCCCCCCCCCBC?CCCCCCCCC@@CACCCCCACCCCCCCCCCCCCCCCCCCCCCCC'''#这里是利用python直接读取压缩的fastq文件def read_gz_file(path): if os.path.exists(path): with gzip.open(path,'rt') as pf: for line in pf: yield line else: print 'the path [{}] is not exist!'.format(path)def ReadFastq(fastq): flag = 1 dict_fq={} if fastq.endswith('gz'): con = read_gz_file(fastq) if getattr(con,'__iter__','None'): for line in con: line=line.strip() flag_index = flag%4 if flag_index == 1: id = line if flag%4 == 2: seq = line else: flag +=1 continue dict_fq[id] = seq flag+=1 return dict_fq else: with open (fastq,'r') as fqr: for line in fqr.readlines(): line = line.strip() flag_index = flag%4 if flag_index == 1: id = line if flag%4 == 2: seq = line else: flag +=1 continue dict_fq[id] = seq flag+=1 return dict_fqdef convert_to_fa(dict_hash,output): with open (output,'w') as fr: for i in dict_hash.keys(): fr.write(i+'\n') fr.write(dict_hash[i]+'\n')if __name__ == '__main__': input = sys.argv[1] output = sys.argv[2] dic_fa = ReadFastq(input) convert_to_fa(dic_fa,output)
2.将reads打断成kmer并统计kmer的频数
#!/usr/bin/env python# coding=utf-8import osimport sysimport refrom pyspark import SparkConf, SparkContextinput_fasta_file ='/home/yueyao/Spark/00.data/both.fa'conf = SparkConf().setMaster("local").setAppName("Yue Yao app")sc = SparkContext(conf = conf)fasta_file = sc.textFile(input_fasta_file)#这里是对fasta文件进行转化操作,过滤掉reads的名称reads_fa = fasta_file.filter(lambda line :">" not in line)#这个函数用来将reads打断成kmer,这里的kmer是25,返回一个列表def map_file(line): seq_lis=[] for i in range(len(line)-25+1): sub_seq = line[i:i+25] seq_lis.append(sub_seq) return seq_liskmer_list = reads_fa.flatMap(map_file)#对打断的kmer进行计数kmer_count = kmer_list.map(lambda id:(id,1))kmer_total_count = kmer_count.reduceByKey(lambda a,b:(a+b))#这里过滤掉了含有N的kmerkmer_not_contain_N = kmer_total_count.filter(lambda line :"N" not in line[0])kmer_key=kmer_not_contain_N.keys()#统计kmer的种类,并计数kmer_vari_count = kmer_not_contain_N.map(lambda kmer_vari:(kmer_vari[1],1))kmer_histo = kmer_vari_count.reduceByKey(lambda a,b:(a+b))#输出kmer频数的结果kmer_histo.saveAsTextFile('Kmer25.histo')kmer_not_contain_N.saveAsTextFile('kmer25')kmer_key.saveAsTextFile('kmer25_key')