아래 과정은 snp chip에서 case-control GWAS study까지의 대략적인 코드이다.
Parse raw file
describe parsing script.
In [1]:
#parse raw file
import csv
handle = open("snp_chips.csv")
reader = csv.reader(handle)
replicate = 4
start_column = -16 * replicate + replicate - 1
samples = reader.next()[start_column : : replicate]
samples = ['snp_id'] + [sample.split('.')[0] for sample in samples]
print 'samples ::', samples
print 'sample count ::', len(samples) - 1
str_output_ori = '{}\n'.format(','.join(samples))
str_output_val = str_output_ori
str_output_val_filtered = str_output_val
str_output_ori = 'chr,pos,' + str_output_ori
pure_row_count = 0
for sp in reader:
if sp[0][0] == '#':
continue
snps = [sp[1]] + sp[start_column :: replicate]
str_snps = ','.join(snps)
str_output_ori += '{0},{1},{2}\n'.format(int(sp[53][-2:]),
sp[56], str_snps)
str_snps = str_snps.replace("AAAA","0")
str_snps = str_snps.replace("AAAC","1")
str_snps = str_snps.replace("AACC","2")
str_snps = str_snps.replace("ACCC","3")
str_snps = str_snps.replace("CCCC","4")
str_snps = str_snps.replace("AAAG","1")
str_snps = str_snps.replace("AAGG","2")
str_snps = str_snps.replace("AGGG","3")
str_snps = str_snps.replace("GGGG","4")
str_snps = str_snps.replace("TTTT","0")
str_snps = str_snps.replace("TTTC","1")
str_snps = str_snps.replace("TTCC","2")
str_snps = str_snps.replace("TCCC","3")
str_snps = str_snps.replace("CCCC","4")
str_snps = str_snps.replace("TTTG","1")
str_snps = str_snps.replace("TTGG","2")
str_snps = str_snps.replace("TGGG","3")
str_snps = str_snps.replace("GGGG","4")
str_snps = str_snps.replace("X","nan")
str_snps = str_snps.replace("NC","nan")
str_output_val += '{}\n'.format(str_snps)
if 'nan' not in str_snps:
pure_row_count += 1
str_output_val_filtered += '{}\n'.format(str_snps)
#print 'snps which all samples have value :: ', pure_row_count
handle.close()
handle_w = open('snp_chips.parsed.csv','w')
handle_w.write(str_output_ori)
handle_w.close()
handle_w = open('snp_chips.parsed.value.csv','w')
handle_w.write(str_output_val)
handle_w.close()
handle_w = open('snp_chips.parsed.value.filterd.csv','w')
handle_w.write(str_output_val_filtered)
handle_w.close()
SNP simple stats
In [2]:
#SNP simple stats
import pandas as pd
import matplotlib.pyplot as plt
data = pd.read_csv('snp_chips.parsed.value.csv', index_col=0)
## hist duplicated probes
duplicated_snps = []
indexes = list(data.index)
for snp in indexes:
duplicated_count = indexes.count(snp)
if duplicated_count > 1:
duplicated_snps.append(snp)
print "duplicate probes :: ", len(set(duplicated_snps))
data["index"] = data.index
data.drop_duplicates(cols='index', take_last=True, inplace=True) # remove
# duplicate row
del data["index"]
## plot non-nan count of samples
snp_count_per_sample = data.apply(lambda x: sum(x.notnull()))
fig, ax = plt.subplots(figsize=(15,4))
ax.plot(snp_count_per_sample, marker='o', ls='-', color='blue')
ax.set_xticks(range(0,16))
ax.set_xticklabels(snp_count_per_sample.index, rotation=-45)
ax.set_xlim([-1,16])
ax.grid(True)
ax.set_title('# of SNPs with value per sample')
ax.set_xlabel('sample names')
ax.set_ylabel('# of SNPs with value')
for i in range(len(snp_count_per_sample.index)):
ax.text(i, snp_count_per_sample[i], snp_count_per_sample[i],
ha='center', va='bottom')
In [3]:
data = data.loc[data.apply(lambda x: sum(x.isnull()),axis=1) == 0,:]
print 'filtered data ::', data.shape
PCA dimension reduction
In [4]:
#PCA dimension reduction
from sklearn.decomposition import PCA
import math
import numpy as np
pca = PCA(n_components=3)
pca.fit(data.T)
tc = pca.transform(data.T)
colors = cm.rainbow(np.linspace(0, 1, len(tc)))
fig, ax = plt.subplots(figsize=(10,10))
for i, sample in enumerate(data.columns):
ax.plot(tc[i, 0], tc[i, 1], marker='o', ls='*', color=colors[i])
ax.text(tc[i, 0], tc[i, 1], sample, ha='left', va='bottom',
color=colors[i])
ax.set_title('PCA reduction')
ax.set_xlabel('principle component 1')
ax.set_ylabel('principle component 2')
Out[4]:
Phylogenetic tree
- Calculate genetic distance
- Calculate Genetic distance (http://www.plosgenetics.org/article/info%3Adoi%2F10.1371%2Fjournal.pgen.1002451)
: Genetic distance (D) between pair-wise combinations of individuals was calculated as PLINK where
\[ D = 1−{(IBS2+0.5 \times IBS1) \over N} \]
IBS2 and IBS1 are the number of loci that share either 2 or 1 alleles identical by state (IBS), respectively, and N is the number of loci tested. - Make distance matrix file (phylip input file)
: The input format for distance data is straightforward. The first line of the input file contains the number of species. There follows species data, starting, as with all other programs, with a species name. The species name is ten characters long, and must be padded out with blanks if shorter. And every cell should be space delimited.
In [7]:
## calculate genetic distance
import math
distance_file = 'distance_matrix.txt'
def calculate_distance(s1, s2, allele_num):
return 1 - (allele_num - (s1 - s2).apply(math.fabs)).sum() / (allele_num * len(s1))
distance_matrix = data.apply(lambda x1: data.apply(
lambda x2: calculate_distance(x1, x2, 4)))
str_output = '{0}\n'.format(len(distance_matrix))
for sample in distance_matrix.index:
sample_name = sample[:10] if len(sample) > 10 else sample + ' '*(10-len(sample))
str_output += '{0} {1}\n'.format(sample_name,
' '.join(['{:.4f}'.format(v) for v in distance_matrix.loc[sample,]]))
handle = open(distance_file, 'w')
handle.write(str_output)
handle.close()
- Convert distance matrix to newick format
Use Phylip (check "Running the programs in background or under control of a command file" in http://evolution.genetics.washington.edu/phylip/doc/main.html).
In [8]:
import os
from IPython.display import Image
if os.path.isfile('outfile'):
os.remove('outfile')
if os.path.isfile('outtree'):
os.remove('outtree')
str_output = '{0}\nY\n'.format(distance_file)
handle = open('fitch_input','w')
handle.write(str_output)
handle.close()
os.system('fitch < fitch_input > fitch_result')
Out[8]:
- Draw tree
use phylip drawgram, drawtree
In [9]:
## draw tree with python package "ete2"
from ete2 import Tree, TreeStyle
handle = open('outtree')
lines = handle.readlines()
handle.close()
newick = ''.join([line.strip() for line in lines])
t = Tree(newick)
t.show()
#t.render('myTree.png', w=150, h = 150, units='mm')
#Image(filename='myTree.png')
In [10]:
## draw tree with phylip "drawtree"
## phylip "drawtree" make ps format file. Converting file format is needed to render image.
loc_fontfile = 'font1' # check the location of fontfile
handle = open('drawtree_input', 'w')
handle.write('outtree\n{0}\nY\n'.format(loc_fontfile))
handle.close()
commands = []
commands.append('drawtree < drawtree_input > drawtree_result') # use phylip "drawtree"
commands.append('mv plotfile drawtree.ps')
commands.append('convert drawtree.ps drawtree.png') # use "convert"
for command in commands:
os.system(command)
Image(filename='drawtree.png')
Out[10]:
In [11]:
# draw tree with phylip "fitch" result
handle = open('outfile')
lines = handle.readlines()
handle.close()
start_index_str = 'Negative branch lengths not allowed\n'
end_index_str = 'remember: this is an unrooted tree!\n'
print '<Unrooted Tree>'
print ''.join(lines[lines.index(start_index_str)+1:
lines.index(end_index_str)])
Significant diff SNPs
use plink (consider all of heterozygosity as 1) up to manhattan plot PLINK case-control association test result("assoc") fields summaryCHR Chromosome SNP SNP ID BP Physical position (base-pair) A1 Minor allele name (based on whole sample) F_A Frequency of this allele in cases F_U Frequency of this allele in controls A2 Major allele name CHISQ Basic allelic test chi-square (1df) P Asymptotic p-value for this test OR Estimated odds ratio (for A1, i.e. A2 is reference)
In [12]:
# make PLINK input files
group1 = ['sample1', 'sample2', 'sample3', 'sample4', 'sample5', 'sample6']
group2 = ['sample7', 'sample8', 'sample9', 'sample10', 'sample11',
'sample12', 'sample13', 'sample14', 'sample15','sample16']
## make MAP file
data_plink = pd.read_csv('snp_chips.parsed.csv', index_col=2)
data_plink = data_plink.loc[data.index,:]
data_plink['id'] = data_plink.index
data_plink.drop_duplicates(cols='id', take_last=True, inplace=True)
data_map = data_plink[['chr', 'id', 'pos']]
data_map.to_csv('data.map', sep=' ', index=False, header=False)
## make PED file
data_plink = data_plink.drop(['chr', 'pos', 'id'], axis=1)
data_ped = data_plink.apply(lambda x: ' '.join('{0} {1}'.format(
snp[0], snp[-1]) for snp in x))
str_output = ''
for sample in data_ped.index:
str_output += '{0} {1} {2}\n'.format(sample, 1 if sample in group1
else 2, data_ped[sample])
handle = open('data.ped', 'w')
handle.write(str_output)
handle.close()
# execute plink
plink_command = 'plink --file data --assoc --no-fid --no-parents '\
'--allow-no-sex --no-sex --map3 --adjust'
os.system(plink_command)
Out[12]:
In [13]:
# draw Manhattan plot
import matplotlib.cm as cm
plink_map = pd.read_csv('data.map', header=None, sep=' ', index_col=1)
plink_map.columns = ['chr', 'pos']
handle = open('plink.assoc')
lines = handle.readlines()
handle.close()
plink_pvalue = pd.DataFrame([{'id': line.strip().split()[1],
'pvalue':line.strip().split()[8]} for line\
in lines[1:] if line.strip().split()[8] != 'NA' ])
plink_pvalue.index = plink_pvalue['id']
plink_map = plink_map.loc[plink_pvalue.index, :]
result = pd.concat([plink_map, plink_pvalue], join='outer', axis=1)
chr_x_axis = result.pivot_table(cols='chr', aggfunc='max')
chr_x_axis = chr_x_axis.loc['pos',].cumsum()
x_axis = result.apply(lambda x: (x['pos'] + chr_x_axis[x['chr']-1])\
if x['chr'] != 0else x['pos'], axis=1)
y_axis = pd.Series([-math.log(float(x), 10) for x in result['pvalue']])
fig, ax = plt.subplots(figsize=(15,4))
colors = cm.rainbow(np.linspace(0, 1, len(chr_x_axis)))
for i, chr in enumerate(chr_x_axis.index):
rt = result.loc[result['chr'] == chr, :]
x_axis = rt.apply(lambda x: (x['pos'] + chr_x_axis[x['chr']-1])\
if x['chr'] != 0else x['pos'], axis=1)
y_axis = pd.Series([-math.log(float(x), 10) for x in rt['pvalue']])
ax.plot(x_axis, y_axis, ls='*', marker='o',color=colors[i])
ax.set_xticks([0] + [i for i in chr_x_axis.values])
ax.set_xticklabels(['chr{}'.format(i) for i in chr_x_axis.index],
rotation=-45)
ax.grid(True)
ax.set_title('Manhattan plot')
ax.set_xlabel('chromosome (bp)')
ax.set_ylabel('pvalue (-log10)')
Out[13]:
SNPs diversity
ROD
\[ ROD = 1 - \pi_{cul} / \pi_{wild} \] \[ \pi = \Sigma_{ij}x_ix_j\pi_{ij} = 2* \Sigma_{i=1}^n\Sigma_{j=1}^{i-1}x_ix_j\pi_{ij} \]where \(x_i\) and \(x_j\) are the respective frequencies of the ith and jth sequences, \(\pi_{ij}\) is the number of nucleotide differences per nucleotide site between the ith and jth sequences, and n is the number of sequences in the sample.
- calculation example : http://www.vcu.edu/csbc/lfsc520/week7/Week7LectureNotes.pdf
selective sweep analysis\({}^1\)
We used allele counts at SNP positions to identify signatures of selection in sliding 40-kb windows, for pools of sequence data. For each pool and SNP, we determined the numbers of reads corresponding to the most and least abundant allele (\(n_{MAJ}\) and \(n_{MIN}\)). For each window in each breed pool, we calculated a pooled heterozygosity score: \(H_p = 2\Sigma n_{MAJ}\Sigma n_{MIN}/ (\Sigma n_{MAJ} + \Sigma n_{MIN}){}^2\), where SnMAJ and SnMIN are the sums of \(n_{MAJ}\) and, respectively, \(n_{MIN}\) for all SNPs in the window. Individual \(H_p\) values were then Z-transformed as follows: \(ZH_p = (H_p - \mu H_p)/ \sigma H_p\). \({}_{1. http://www.nature.com/nature/journal/v464/n7288/abs/nature08832.html}\)
In []: