Lesson 3 Jupyter Notebook Companion¶
In [ ]:
# Execute a command that has a cell output
print("Hello Jupyter")
Hello Jupyter
In [ ]:
# Execute a command that raises an error
print("Hello Jupyter", end=1)
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) /Users/gilbartv/Documents/git/python-intro/exercise/jupyter-notebook/lesson-3-companion.ipynb Cell 3 line 2 <a href='vscode-notebook-cell:/Users/gilbartv/Documents/git/python-intro/exercise/jupyter-notebook/lesson-3-companion.ipynb#W1sZmlsZQ%3D%3D?line=0'>1</a> # Execute a command that raises an error ----> <a href='vscode-notebook-cell:/Users/gilbartv/Documents/git/python-intro/exercise/jupyter-notebook/lesson-3-companion.ipynb#W1sZmlsZQ%3D%3D?line=1'>2</a> print("Hello Jupyter", end=1) TypeError: end must be None or a string, not int
Recap of lesson 2¶
We are following the exercise available in lesson 2 of the python introduction for PhD IMCBio course.
The dataset studied describes the percent of A, T, C, G in 5 sequences:
- seq1
- seq2
- seq3
- seq4
- seq5
Setup¶
We import the dataset as a pandas datatable and reformat it.
In [ ]:
import pandas as pd
df = pd.read_csv('exercise/data/example.txt', index_col=0, sep=' ')
df = pd.melt(df, var_name='nucl', value_name='freq', ignore_index=False)
Description of dataset¶
Here are some basic information about the dataset to understand its content.
In [ ]:
df.index
Out[ ]:
Index(['seq1', 'seq2', 'seq3', 'seq4', 'seq5', 'seq1', 'seq2', 'seq3', 'seq4',
'seq5', 'seq1', 'seq2', 'seq3', 'seq4', 'seq5', 'seq1', 'seq2', 'seq3',
'seq4', 'seq5'],
dtype='object', name='Seq')
In [ ]:
df.columns
Out[ ]:
Index(['nucl', 'freq'], dtype='object')
In [ ]:
df.head()
Out[ ]:
| nucl | freq | |
|---|---|---|
| Seq | ||
| seq1 | A | 0.46 |
| seq2 | A | 0.20 |
| seq3 | A | 0.16 |
| seq4 | A | 0.18 |
| seq5 | A | 0.26 |
Statistics¶
We will calculate the mean value frequencies of all nucleotides.
In [ ]:
df['freq'].mean()
Out[ ]:
0.25
We will calculate the mean value frequencies per nucleotides.
In [ ]:
df.groupby("nucl")[['freq']].mean()
Out[ ]:
| freq | |
|---|---|
| nucl | |
| A | 0.252 |
| C | 0.252 |
| G | 0.256 |
| T | 0.240 |
Testing magic commands¶
In [ ]:
# %load exercise/script/analyse_fasta.py
#!/usr/bin/env python3
import sys
def nucl_freq(seq):
if not isinstance(seq, str):
raise TypeError("Input must be a string.")
valid_nucl = {"A", "T", "C", "G"}
seq_nucl = set(seq)
if seq_nucl.difference(valid_nucl) != set():
raise ValueError("Input string must only contain characters A, C, T or G.")
n = len(seq)
freq = dict()
for nucl in valid_nucl:
freq[nucl] = seq.count(nucl)/n
return freq
def analyse_fasta(input_file, output_file):
freq = {}
with open(input_file, 'r') as input:
for line in input:
if line.startswith(">"):
sequence_name = line.strip()[1:]
else:
current_sequence = line.strip()
freq[sequence_name] = nucl_freq(current_sequence)
with open(output_file, 'w') as output:
output.write("Seq A T C G\n")
for key, value in freq.items():
output.write(f"{key} {value.get('A')} {value.get('T')} {value.get('C')} {value.get('G')}\n")
return None
In [ ]:
nucl_freq("AACTTG")
Out[ ]:
{'T': 0.3333333333333333,
'C': 0.16666666666666666,
'G': 0.16666666666666666,
'A': 0.3333333333333333}
In [ ]:
%whos
del nucl_freq, analyse_fasta
%whos
Variable Type Data/Info
------------------------------------------------
analyse_fasta function <function analyse_fasta at 0x132eb7100>
f function <function f at 0x10589a840>
fig Figure Figure({\n 'data': [{'<...>': {'text': 'freq'}}}\n})
interact _InteractFactory <ipywidgets.widgets.inter<...>ry object at 0x105887a50>
interactive_plot function <function interactive_plot at 0x113baccc0>
nucl_freq function <function nucl_freq at 0x132eb7740>
pd module <module 'pandas' from '/U<...>ages/pandas/__init__.py'>
plt module <module 'matplotlib.pyplo<...>es/matplotlib/pyplot.py'>
px module <module 'plotly.express' <...>tly/express/__init__.py'>
sys module <module 'sys' (built-in)>
widgets module <module 'ipywidgets.widge<...>ets/widgets/__init__.py'>
Variable Type Data/Info
------------------------------------------------
f function <function f at 0x10589a840>
fig Figure Figure({\n 'data': [{'<...>': {'text': 'freq'}}}\n})
interact _InteractFactory <ipywidgets.widgets.inter<...>ry object at 0x105887a50>
interactive_plot function <function interactive_plot at 0x113baccc0>
pd module <module 'pandas' from '/U<...>ages/pandas/__init__.py'>
plt module <module 'matplotlib.pyplo<...>es/matplotlib/pyplot.py'>
px module <module 'plotly.express' <...>tly/express/__init__.py'>
sys module <module 'sys' (built-in)>
widgets module <module 'ipywidgets.widge<...>ets/widgets/__init__.py'>
In [ ]:
%run exercise/script/analyse_fasta.py
nucl_freq("AACTTG")
Out[ ]:
{'T': 0.3333333333333333,
'C': 0.16666666666666666,
'G': 0.16666666666666666,
'A': 0.3333333333333333}
In [ ]:
import pandas as pd
import matplotlib.pyplot as plt
from ipywidgets import interact
from ipywidgets import widgets
# Create a DataFrame from the given data
df = pd.read_csv('exercise/data/example.txt', index_col=0, sep=' ')
def interactive_plot(columns):
# Plot the data
fig, ax = plt.subplots(1, figsize=(5, 6))
bottom = pd.Series([0,0,0,0,0], index = df.index)
for col in columns:
ax.bar(df.index, df[col], label=col, bottom=bottom)
bottom += df[col]
# Add labels and legend
ax.set_xlabel('Sequences')
ax.set_ylabel('Frequency')
ax.legend(title='Nucleotides', bbox_to_anchor=(0, 1),
loc='lower left', ncols=4)
plt.show()
In [ ]:
interact(interactive_plot, columns=df.columns)
interactive(children=(Dropdown(description='columns', options=('A', 'T', 'C', 'G'), value='A'), Output()), _do…
Out[ ]:
<function __main__.interactive_plot(columns)>
In [ ]:
interact(interactive_plot, columns=widgets.SelectMultiple(
options=df.columns,
value=[df.columns[0]],
description='Fruits',
disabled=False
))
interactive(children=(SelectMultiple(description='Fruits', index=(0,), options=('A', 'T', 'C', 'G'), value=('A…
Out[ ]:
<function __main__.interactive_plot(columns)>
Plotly express¶
In [ ]:
import plotly.express as px
import pandas as pd
df = pd.read_csv('exercise/data/example.txt', sep=' ')
df = pd.melt(df, var_name='nucl', value_name='freq', id_vars=['Seq'])
fig = px.bar(df, x="Seq", y="freq", color = "nucl", title="Nucleotide frequency")
fig.show()