






















Study with the several resources on Docsity
Earn points by helping other students or get them with a premium plan
Prepare for your exams
Study with the several resources on Docsity
Earn points to download
Earn points by helping other students or get them with a premium plan
The process of DNA replication, focusing on the identification of the origin of replication (OriC) in a DNA sequence. the biology of DNA strands, the role of DNA polymerases, and the differences between prokaryotic and eukaryotic replication. It also covers the implications of leaving single-stranded DNA exposed and the potential mutations that can occur. The document concludes by suggesting a new algorithm for finding OriC based on genome-wide GC skew.
Typology: Lecture notes
1 / 30
This page cannot be seen from the preview
Don't miss anything!























The most common mutation type is calleddeanimation
def loadFasta(filename): """ Parses a classically formatted and possibly compressed FASTA file into a list of headers and fragment sequences for each sequence contained""" if (filename.endswith(".gz")): fp = gzip.open(filename, 'rb') else : fp = open(filename, 'rb') # split at headers data = fp.read().split('>') fp.close() # ignore whatever appears before the 1st header data.pop(0) headers = [] sequences = [] for sequence in data: lines = sequence.split('\n') headers.append(lines.pop(0)) # add an extra "+" to make string "1-referenced" sequences.append('+' + ''.join(lines)) return (headers, sequences)
def getStatsV1(sequence, start): halflen = len(sequence) // 2 terC = start + halflen # handle genome's circular nature if (terC > len(sequence)): terC = terC - len(sequence) + 1 total = { base: 0 for base in "ACGT" } forwardCount = { base: 0 for base in "ACGT" } reverseCount = { base: 0 for base in "ACGT" } for position in xrange(1,len(sequence)): base = sequence[position] total[base] += 1 if (terC > start): if position >= start and position < terC: forwardCount[base] += 1 else : reverseCount[base] += 1 else : if position >= start or position < terC: forwardCount[base] += 1 else : reverseCount[base] += 1 return {key: (total[key], forwardCount[key], reverseCount[key]) for key in total.iterkeys()}
def getStatsV2(sequence, start): halflen = len(sequence) // 2 terC = start + halflen # handle genome's circular nature if (terC > len(sequence)): terC = terC - len(sequence) + 1 stats = {} for base in "ACGT": total = sequence.count(base) if (terC > start): forwardCount = sequence[start:terC].count(base) reverseCount = total - forwardCount else : reverseCount = sequence[terC:start].count(base) forwardCount = total - reverseCount stats[base] = (total, forwardCount, reverseCount) return stats
import numpy
def getStatsV3(sequence, start): halflen = len(sequence) // 2 terC = start + halflen # handle genome's circular nature if (terC > len(sequence)): terC = terC - len(sequence) + 1 genome = numpy.fromstring(sequence, dtype="uint8") total = numpy.bincount(genome) if (terC > start): forwardCount = numpy.bincount(genome[start:terC]) reverseCount = total - forwardCount else : reverseCount = numpy.bincount(match[terC:start]) forwardCount = total - reverseCount return {b: (total[ord(b)],forwardCount[ord(b)],reverseCount[ord(b)]) for b in "ACGT"}
answer = getStatsV3(seq[0], oriCStart + oriOffset)
for base in "CGAT": total, forwardCount, reverseCount = answer[base] print "%s: %8d %8d %8d %8d" % (base, total, forwardCount, reverseCount, forwardCount - reverseCount) print
% timeit getStatsV2(seq[0], oriCStart + oriOffset) % timeit getStatsV3(seq[0], oriCStart + oriOffset)
10 loops, best of 3: 36.8 ms per loop 100 loops, best of 3: 14.7 ms per loop
j =
i
v = numpy.array(numpy.random.random(20) < 0.25, dtype="int8") s = numpy.concatenate(([0],v.cumsum())) print v print s print s[15] - s[5]
def GCSkew(sequence): half = len(sequence) // 2 full = len(sequence) genome = numpy.fromstring(sequence + sequence, dtype='uint8') matchC = numpy.concatenate(([0], numpy.array(genome == ord('C'), dtype="int8").cumsum())) matchG = numpy.concatenate(([0], numpy.array(genome == ord('G'), dtype="int8").cumsum())) matchGC = matchG - matchC skew = matchGC[half:half + full] - matchGC[0:full] + matchGC[full - half:2 ***** full - half] - matchGC[full:2 ***** full] return skew
% matplotlib inline import matplotlib import matplotlib.pyplot as plt
test = "+CATGGGCATCGGCCATACGCC" y = GCSkew(test) plt.figure(num=None, figsize=(24, 8), dpi=100) plt.ylim([ - 10,10]) plt.xticks(range(len(test)), [c for c in test]) result = plt.plot(range(len(y)), y)