Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified __pycache__/load.pypy-22.pyc
Binary file not shown.
33,291 changes: 33,291 additions & 0 deletions data/genes segments for software_design_extension.txt

Large diffs are not rendered by default.

86 changes: 49 additions & 37 deletions gene_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,12 @@
import random
from amino_acids import aa, codons, aa_table # you may find these useful
from load import load_seq
from load import load_contigs
contigs = load_contigs()

mycontig = contigs[2][1]
newcontig = "".join(['A' if x not in 'ATGC' else x for x in mycontig])
# print newcontig

def shuffle_string(s):
"""Shuffles the characters in the input string
Expand All @@ -19,25 +24,31 @@ def shuffle_string(s):

# YOU WILL START YOUR IMPLEMENTATION FROM HERE DOWN ###


def get_complement(nucleotide):
""" Returns the complementary nucleotide

nucleotide: a nucleotide (A, C, G, or T) represented as a string
returns: the complementary nucleotide
>>> get_complement('A')
'T'
>>> get_complement('C')
'G'
"""
if nucleotide == 'A':
return 'T'
elif nucleotide == 'T':
return 'A'
elif nucleotide == 'C':
return 'G'
elif nucleotide == 'G':
return 'C'
# compl = {"A":"T","T":"A","C":"G","G":"C"}
# def get_complement(nucleotide):
# """ Returns the complementary nucleotide

# nucleotide: a nucleotide (A, C, G, or T) represented as a string
# returns: the complementary nucleotide
# >>> get_complement('A')
# 'T'
# >>> get_complement('C')
# 'G'
# """

# try:
# return compl[nucleotide]
# except KeyError:
# return "X"

# if nucleotide == 'A':
# return 'T'
# elif nucleotide == 'T':
# return 'A'
# elif nucleotide == 'C':
# return 'G'
# elif nucleotide == 'G':
# return 'C'

#print get_complement('C')

Expand All @@ -53,10 +64,12 @@ def get_reverse_complement(dna):
'TGAACGCGG'
"""
result = ""
compl = {"A":"T","T":"A","C":"G","G":"C"}
for i in dna:
result+=(get_complement(i))
result+=compl[i]
return result[::-1]


# print get_reverse_complement("CCGCGTTCA")

def rest_of_ORF(dna):
Expand All @@ -74,8 +87,7 @@ def rest_of_ORF(dna):
"""
stopcodon = ['TAG','TAA','TGA']
for i in range(0,len(dna)-2,3):
testcodon = dna[i:i+3] #dna is formatted as "ATGAGATAGG", so list slicing on dna gives a concatenated string
if testcodon in stopcodon:
if dna[i:i+3] in stopcodon: #dna is formatted as "ATGAGATAGG", so list slicing on dna gives a concatenated string
return dna[0:i]
return dna

Expand All @@ -97,10 +109,8 @@ def find_all_ORFs_oneframe(dna):
i = 0
stopcodon = ['TAG','TAA','TGA']
while i<len(dna)-2:
testframe = dna[i:i+3]
if testframe == "ATG": #Be very careful about the variable type! ["ATG"] is not the same as "ATG".
x = rest_of_ORF(dna[i:])
result.append(x)
if dna[i:i+3] == "ATG": #Be very careful about the variable type! ["ATG"] is not the same as "ATG".
result.append(rest_of_ORF(dna[i:]))
i = next((m for m in range(i+3,len(dna)-2, 3) if dna[m:m+3] in stopcodon), len(dna)-2)
# i = y
else:
Expand All @@ -125,7 +135,7 @@ def find_all_ORFs(dna):
"""
result = []
for i in range(0,3):
result = result + (find_all_ORFs_oneframe(dna[i:]))
result+=(find_all_ORFs_oneframe(dna[i:])) #find_all_ORFs outputs a list, and we want to add the elements and not append a list to another list.
return result

# print find_all_ORFs("ATGCATGAATGTAG")
Expand All @@ -152,16 +162,18 @@ def longest_ORF(dna):
>>> longest_ORF("ATGCGAATGTAGCATCAAA")
'ATGCTACATTCGCAT'
"""
elementArray = []
elementLengthArray = []
store = {}
largestlength = 0
for i in find_all_ORFs_both_strands(dna):
elementArray.append(i)
elementLengthArray.append(len(i))
return [x for x in elementArray if len(x) == max(elementLengthArray)]
if len(i)>largestlength:
store[len(i)] = i
largestlength = len(i)
return store[largestlength]

# print longest_ORF("ATGCGAATGTAGCATCAAA")

dna = load_seq("/home/leon/GeneFinder/data/X73525.fa")
# dna = load_seq("/home/leon/GeneFinder/data/X73525.fa")
# print dna

def longest_ORF_noncoding(dna, num_trials):
""" Computes the maximum length of the longest ORF over num_trials shuffles
Expand All @@ -173,12 +185,12 @@ def longest_ORF_noncoding(dna, num_trials):

count = 0
for i in range(num_trials):
x = len(max(longest_ORF(shuffle_string(dna))))
x = len(longest_ORF(shuffle_string(dna)))
if x > count:
count = x
return count

# print longest_ORF_noncoding(dna, 1000)
# print longest_ORF_noncoding(newcontig, 1)


def coding_strand_to_AA(dna):
Expand Down Expand Up @@ -207,12 +219,12 @@ def gene_finder(dna):
dna: a DNA sequence
returns: a list of all amino acid sequences coded by the sequence dna.
"""
threshold = longest_ORF_noncoding(dna, 1000)
threshold = 1500#longest_ORF_noncoding(dna, 1)
orfList = find_all_ORFs_both_strands(dna)
aaList = [coding_strand_to_AA(c) for c in orfList if len(c)>threshold]
return aaList

print gene_finder(dna)
print gene_finder(newcontig)


# if __name__ == "__main__":
Expand Down
25 changes: 19 additions & 6 deletions load.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,17 +65,21 @@ def extract_next_gene(metagenome_lines, next_line):
metagenome_lines[start_line:next_line]]),
next_line)

def load_contigs():
""" Loads the DNA contigs for a new bacterial communicty
returns: a list of DNA snippets consisting of (name, sequence)
tuples. The sequence is represented as an uppercase
string of nucleotides
"""
return load_metagenome_helper('genes segments for software_design_extension.txt')

def load_metagenome():
""" Loads a metagenome of a bacterial contig.
def load_metagenome_helper(metagenome_file):
""" Loads the metagenome stored in the specified file.
returns: a list of DNA snippets consisting of (name, sequence)
tuples. The sequence is represented as an uppercase
string of nucleotides
"""
f = open(path.join('.',
'data',
'3300000497.a_metagenome_phototrophic community.fna'),
'r')
f = open(path.join('.', 'data', metagenome_file), 'r')

metagenome_lines = f.readlines()
f.close()
Expand All @@ -86,3 +90,12 @@ def load_metagenome():
next_line)
snippets.append((label, dna.upper()))
return snippets


def load_metagenome():
""" Loads a metagenome of a bacterial contig.
returns: a list of DNA snippets consisting of (name, sequence)
tuples. The sequence is represented as an uppercase
string of nucleotides
"""
return load_metagenome_helper('3300000497.a_metagenome_phototrophic community.fna')

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good- you fixed most of the issue from the last time you submitted this!

Binary file modified load.pyc
Binary file not shown.
1 change: 1 addition & 0 deletions output.txt

Large diffs are not rendered by default.