msa-consensus-pssm-and-markov/sequence_profile.py at main · Raminyazdani/msa-consensus-pssm-and-markov · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import sys
from Bio.Align import MultipleSeqAlignment
import logomaker
from Bio import AlignIO
import csv
import os
import pandas as pd
from pathlib import Path

# Output file prefix - can be overridden via MSA_OUTPUT_PREFIX environment variable
OUTPUT_PREFIX = os.environ.get('MSA_OUTPUT_PREFIX', 'sequence_analysis')

# Get the project root directory (where this script is located)
SCRIPT_DIR = Path(__file__).resolve().parent


def calculate_sequence_profiles(consensus_file):
    """
        Function to calculate sequence profiles

        Parameters
        ----------
        consensus_file : str
            path to consensus file (clustal omega fasta output)

        Return
        ----------
        pssm : pssm file
            # Hint check imports

    """

    pssm = None

    # Read the MSA file
    alignment = AlignIO.read(consensus_file, "fasta")

    # Generate Position Specific Scoring Matrix manually
    # since AlignInfo is deprecated
    pssm = []
    for i in range(alignment.get_alignment_length()):
        column = alignment[:, i]
        # Count occurrences of each character
        counts = {}
        for char in column:
            if char != 'X':  # chars_to_ignore
                counts[char] = counts.get(char, 0) + 1
        pssm.append(counts)

    return pssm


def sequence_logo_creator(sequence_profile_csv):
    """
        Function to read sequence profile csv file, plot sequence logo and saves as png

        Parameters
        ----------
        sequence_profile_csv : str
            path to sequence profile csv

        Return
        ----------
        None

    """

    # Hint: You can use pandas dataframe

    logo = None

    # TODO read csv file and create sequence logo
    # read data
    data = pd.read_csv(sequence_profile_csv)

    # create logo
    logo = logomaker.Logo(data
                          , color_scheme='chemistry',
                          figsize=(50, 5),
                          stack_order='small_on_top')

    logo.fig.savefig(SCRIPT_DIR / f"{OUTPUT_PREFIX}_sequence_logo.png")


def csv_writer(pssm, output_file):
    """
        Function to write probabilities to csv file

        Parameters
        ----------
        pssm : pssm file
            returned pssm from calculate_sequence_profiles

        output_file : str
            path for output file

        Return
        ----------
        None

    """
    # Collect all unique characters across all positions
    all_chars = set()
    for pos in pssm:
        all_chars.update(pos.keys())

    # a sorted header of the alphabet
    header = sorted(list(all_chars))

    # change the count to probability and fill missing positions with 0
    for _, l in enumerate(pssm, start=1):
        sum_init = sum(l.values())
        # Add missing characters with 0 count
        for char in header:
            if char not in l:
                l[char] = 0
        # Convert counts to probabilities
        for k in l.keys():
            l[k] = l[k] / sum_init if sum_init > 0 else 0

    # write the pssm to a csv file
    with open(output_file, mode='w') as file:
        writer = csv.DictWriter(file, fieldnames=header)
        writer.writeheader()
        writer.writerows(pssm)

# Nothing to do here
if __name__ == "__main__":

    seq_profiles = calculate_sequence_profiles(sys.argv[1])
    # Example: seq_profiles = calculate_sequence_profiles("./msa_output.fasta")
    csv_writer(seq_profiles, SCRIPT_DIR / f"{OUTPUT_PREFIX}_sequence_profile.csv")
    sequence_logo_creator(SCRIPT_DIR / f"{OUTPUT_PREFIX}_sequence_profile.csv")