fraggler.fraggler
# TODO: add rere.py as testing
# Make a script and print the results in the terminal and capture it!


class bcolors:
    """ANSI escape codes used to colorize terminal output."""

    OKBLUE = "\033[94m"
    OKCYAN = "\033[96m"
    OKGREEN = "\033[92m"
    WARNING = "\033[93m"
    FAIL = "\033[91m"
    ENDC = "\033[0m"
    UNDERLINE = "\033[4m"


def print_green(text):
    """Print an informational message in green."""
    print(f"{bcolors.OKGREEN}[INFO]: {text}{bcolors.ENDC}")


def print_warning(text):
    """Print a warning message in yellow, underlined."""
    print(f"{bcolors.WARNING}{bcolors.UNDERLINE}[WARNING]: {text}{bcolors.ENDC}")


def print_fail(text):
    """Print an error message in red, underlined."""
    print(f"{bcolors.FAIL}{bcolors.UNDERLINE}[ERROR]: {text}{bcolors.ENDC}")


def print_blue(text):
    """Print a summary message in blue."""
    print(f"{bcolors.OKBLUE}[SUMMARIZE]: {text}{bcolors.ENDC}")


# The imports below are slow, so tell the user what is happening first.
print_green("Starting fraggler, importing libraries...")

import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import SplineTransformer
from sklearn.metrics import mean_squared_error, r2_score
from scipy.interpolate import UnivariateSpline
from pathlib import Path
from scipy import sparse
from scipy.sparse import linalg
from numpy.linalg import norm
from Bio import SeqIO
from scipy import signal
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import altair as alt
from lmfit.models import VoigtModel, GaussianModel, LorentzianModel
import re
import sys
from datetime import datetime
import argparse
import warnings
import platform
import panel as pn
import pandas_flavor as pf


warnings.filterwarnings("ignore")
# For Windows users: force the non-interactive Agg backend.
# BUG FIX: `import matplotlib.pyplot as plt` binds only the name `plt`,
# so `matplotlib` itself must be imported (done above) or this line
# raises NameError on Windows.
if platform.system() == "Windows":
    matplotlib.use("agg")


from .ladders import LADDERS

### UTILITY FUNCTIONS ###
@pf.register_dataframe_method
def pivot_wider(
    df_: pd.DataFrame, index: list, names_from: list, values_from: list
) -> pd.DataFrame:
    """
    Turn a long dataframe into a wide one (tidyr-style pivot_wider).

    The resulting multi-level columns are flattened into single strings
    joined by underscores.
    """
    df = df_.pivot(index=index, columns=names_from, values=values_from).reset_index()
    names = [[str(y) for y in x] for x in df.columns]
    names = ["_".join(x).strip("_") for x in names]
    df.columns = names

    return df
def baseline_arPLS(y, ratio=0.99, lam=100, niter=1000, full_output=False):
    """
    Estimate a signal baseline with asymmetrically reweighted penalized
    least squares (arPLS).

    Taken from:
    https://stackoverflow.com/questions/29156532/python-baseline-correction-library

    from this paper:
    https://pubs.rsc.org/en/content/articlelanding/2015/AN/C4AN01061B#!divAbstract
    """
    n = len(y)

    ones = np.ones(n - 2)
    D = sparse.spdiags([ones, -2 * ones, ones], [0, -1, -2], n, n - 2)

    # The transposes are flipped w.r.t the Algorithm on pg. 252
    H = lam * D.dot(D.T)

    w = np.ones(n)
    W = sparse.spdiags(w, 0, n, n)

    criterion = 1
    iteration = 0

    while criterion > ratio:
        z = linalg.spsolve(W + H, W * y)
        residual = y - z
        negatives = residual[residual < 0]

        mean_neg = np.mean(negatives)
        std_neg = np.std(negatives)

        w_new = 1 / (
            1 + np.exp(2 * (residual - (2 * std_neg - mean_neg)) / std_neg)
        )

        criterion = norm(w_new - w) / norm(w)

        w = w_new
        # Update the diagonal in place instead of building a new matrix.
        W.setdiag(w)

        iteration += 1
        if iteration > niter:
            print("Maximum number of iterations exceeded")
            break

    if not full_output:
        return z
    info = {"num_iter": iteration, "stop_criterion": criterion}
    return z, residual, info


def make_dir(outpath: str) -> None:
    """Create `outpath` (and any missing parents) unless it exists."""
    target = Path(outpath)
    if not target.exists():
        target.mkdir(parents=True)


def get_files(in_path: str) -> list[Path]:
    """
    Return every .fsa file inside `in_path` when it is a directory,
    otherwise `in_path` itself wrapped in a one-element list.
    """
    path = Path(in_path)
    if not path.is_dir():
        return [path]
    return [entry for entry in path.iterdir() if entry.suffix == ".fsa"]


def file_exists(file):
    """Exit with an error message if `file` does not exist."""
    if Path(file).exists():
        return
    print_fail(f"{file} does not exist!")
    sys.exit(1)


def folder_exists(folder):
    """Exit with an error message if `folder` already exists."""
    if not Path(folder).exists():
        return
    print_fail(f"{folder} already exist!")
    sys.exit(1)
ASCII_ART = f"""{bcolors.OKBLUE}
 █████▒██▀███   ▄▄▄       ▄████   ▄████  ██▓    ▓█████  ██▀███
▓██   ▒▓██ ▒ ██▒▒████▄    ██▒ ▀█▒ ██▒ ▀█▒▓██▒   ▓█   ▀ ▓██ ▒ ██▒
▒████ ░▓██ ░▄█ ▒▒██  ▀█▄ ▒██░▄▄▄░▒██░▄▄▄░▒██░   ▒███   ▓██ ░▄█ ▒
░▓█▒  ░▒██▀▀█▄  ░██▄▄▄▄██░▓█  ██▓░▓█  ██▓▒██░   ▒▓█  ▄ ▒██▀▀█▄
░▒█░   ░██▓ ▒██▒ ▓█   ▓██▒░▒▓███▀▒░▒▓███▀▒░██████▒░▒████▒░██▓ ▒██▒
 ▒ ░   ░ ▒▓ ░▒▓░ ▒▒   ▓▒█░ ░▒   ▒  ░▒   ▒ ░ ▒░▓  ░░░ ▒░ ░░ ▒▓ ░▒▓░
 ░       ░▒ ░ ▒░  ▒   ▒▒ ░  ░   ░   ░   ░ ░ ░ ▒  ░ ░ ░  ░  ░▒ ░ ▒░
 ░ ░     ░░   ░   ░   ▒   ░ ░   ░ ░ ░   ░   ░ ░      ░     ░░   ░
          ░           ░  ░      ░       ░     ░  ░   ░  ░   ░{bcolors.ENDC}
"""


### FSA ###


class FsaFile:
    """
    Container for one ABI .fsa fragment-analysis file.

    Reads the raw ABIF record, validates the ladder name, automatically
    detects the size-standard channel, and holds every intermediate
    result that the pipeline functions later attach to the instance.
    """

    def __init__(
        self,
        file: str,
        ladder: str,
        sample_channel: str,
        min_distance_between_peaks: int,
        min_size_standard_height: int,
        normalize: bool = False,
    ) -> None:
        self.file = Path(file)
        self.file_name = self.file.parts[-1]

        if ladder not in LADDERS:
            print_fail(f"'{ladder}' is not a valid ladder")
            sys.exit(1)
        self.ladder = ladder
        # Raw ABIF record: a dict mapping tag names to values.
        self.fsa = SeqIO.read(file, "abi").annotations["abif_raw"]
        self.sample_channel = sample_channel
        self.normalize = normalize

        self.ladder_steps = LADDERS[ladder]["sizes"]
        self.n_ladder_peaks = self.ladder_steps.size

        self.min_size_standard_height = min_size_standard_height
        self.size_standard_channel = self.find_size_standard_channel()
        self.min_distance_between_peaks = min_distance_between_peaks
        # Allow a few extra candidate peaks beyond the ladder size.
        self.max_peaks_allow_in_size_standard = self.n_ladder_peaks + 5

        ### properties updated by external functions

        # size standard peaks
        self.size_standard_peaks = None
        self.maxium_allowed_distance_between_size_standard_peaks = None
        self.best_size_standard_combinations = None
        self.best_size_standard = None

        # sample data with fitted basepairs
        self.fitted_to_model = False
        self.sample_data_with_basepairs = None
        self.ladder_model = None

        # updated by peak finder functions
        self.sample_data_peaks_raw = None
        self.identified_sample_data_peaks = None
        self.found_peaks = False

        # peak widths
        self.sample_data_peak_widths = None
        self.peaks_with_padding = None

        # area peaks
        self.fitted_area_peaks = None

        # Optionally baseline-correct both traces with arPLS.
        if normalize:
            self.size_standard = np.array(
                baseline_arPLS(self.fsa[self.size_standard_channel])
            )
            self.sample_data = np.array(baseline_arPLS(self.fsa[sample_channel]))
        else:
            self.size_standard = np.array(self.fsa[self.size_standard_channel])
            self.sample_data = np.array(self.fsa[self.sample_channel])

    def find_size_standard_channel(self, degree: int = 2):
        """
        Return the name of the DATA channel most likely holding the size
        standard.

        For each DATA channel the peak positions are regressed against
        their ordinal index with a degree-`degree` polynomial; the
        channel whose peak spacing is most regular (highest r2) wins.
        Returns None when no channel has at least 12 peaks.
        """
        data_channels = [x for x in self.fsa.items() if "DATA" in x[0]]
        best_r2 = 0
        best_fitted = None

        for data in data_channels:
            array = signal.find_peaks(data[1], height=self.min_size_standard_height)[0]

            # Too few peaks cannot be a size standard.
            if len(array) < 12:
                continue
            X = np.arange(len(array)).reshape(-1, 1)
            poly = PolynomialFeatures(degree=degree)
            X_poly = poly.fit_transform(X)
            model = LinearRegression().fit(X_poly, array)
            predicted = model.predict(X_poly)
            r2 = r2_score(array, predicted)

            if r2 > best_r2:
                # BUG FIX: `best_r2` was never updated, so the LAST
                # channel with any positive r2 won instead of the
                # best-fitting one.
                best_r2 = r2
                best_fitted = data[0]

        return best_fitted

    def __repr__(self):
        """Human-readable summary of the file and its analysis state."""
        return f"""
        Filename: {self.file_name}
        Sample Channel: {self.sample_channel}
        Size Standard Channel: {self.size_standard_channel}
        Ladder Name: {self.ladder}
        Number of Ladder Steps: {self.n_ladder_peaks}
        Minimum Distance Between Peaks: {self.min_distance_between_peaks}
        Minimum Size Standard Height: {self.min_size_standard_height}
        Normalized Data: {self.normalize}
        Ladder Steps: {self.ladder_steps}
        Fitted to model: {self.fitted_to_model}
        Found peaks: {self.found_peaks}
        """
def find_size_standard_peaks(fsa):
    """
    Locate candidate size-standard peaks and keep the tallest ones.

    Stores the peak positions, sorted by position, on
    `fsa.size_standard_peaks` and returns the updated FsaFile.
    """
    positions, properties = signal.find_peaks(
        fsa.size_standard,
        height=fsa.min_size_standard_height,
        distance=fsa.min_distance_between_peaks,
    )

    candidates = pd.DataFrame(
        {"peaks": positions, "heights": properties["peak_heights"]}
    )
    tallest = candidates.sort_values("heights", ascending=False).head(
        fsa.max_peaks_allow_in_size_standard
    )
    fsa.size_standard_peaks = tallest.sort_values("peaks")["peaks"].to_numpy()
    return fsa


def return_maxium_allowed_distance_between_size_standard_peaks(fsa, multiplier=2):
    """
    Finds the average distance between peaks and
    multiplies the number with multiplier
    """
    gaps = np.diff(fsa.size_standard_peaks)
    fsa.maxium_allowed_distance_between_size_standard_peaks = int(
        np.mean(gaps) * multiplier
    )
    return fsa


def generate_combinations(fsa):
    """
    Enumerate every ordered combination of `n_ladder_peaks` candidate
    peaks whose consecutive gaps do not exceed the allowed distance,
    using a memoized depth-first search.
    """
    peaks = fsa.size_standard_peaks
    wanted_length = fsa.n_ladder_peaks
    max_gap = fsa.maxium_allowed_distance_between_size_standard_peaks
    cache = {}

    def search(start, depth, path):
        if depth == wanted_length:
            return [[]]

        key = (start, depth)
        if key in cache:
            return cache[key]

        found = []
        for idx in range(start, len(peaks)):
            candidate = peaks[idx]
            # The previous peak (path[-1]) must be within max_gap.
            if path and abs(candidate - path[-1]) > max_gap:
                continue
            for tail in search(idx + 1, depth + 1, path + [candidate]):
                found.append([candidate] + tail)

        cache[key] = found
        return found

    fsa.best_size_standard_combinations = pd.DataFrame(
        {"combinations": search(0, 0, [])}
    )
    return fsa


def calculate_best_combination_of_size_standard_peaks(fsa):
    """
    Finds the best size standard combination of all using UnivariateSpline
    and second derivative: the combination whose interpolating spline is
    "straightest" (smallest maximum |second derivative|) wins.
    """
    combos = fsa.best_size_standard_combinations

    def second_derivative(values):
        return UnivariateSpline(fsa.ladder_steps, values, s=0).derivative(n=2)

    ranked = (
        combos.assign(der=lambda x: [second_derivative(c) for c in x.combinations])
        .assign(
            max_value=lambda x: [np.abs(d(fsa.ladder_steps)).max() for d in x.der]
        )
        .sort_values("max_value", ascending=True)
    )

    fsa.best_size_standard = np.array(ranked.head(1).combinations.squeeze())
    return fsa
def fit_size_standard_to_ladder(fsa):
    """
    Fit a spline regression mapping size-standard peak positions (time)
    to the known ladder basepairs, then annotate the sample data with
    predicted basepairs.

    The number of knots is increased until every predicted basepair is
    unique (at most 20 attempts). On success the FsaFile gets
    `ladder_model`, `sample_data_with_basepairs` and
    `fitted_to_model = True`; otherwise `fitted_to_model` stays False.
    """
    best_combination = fsa.best_size_standard
    n_knots = 3
    for _ in range(20):
        model = make_pipeline(
            SplineTransformer(degree=2, n_knots=n_knots, extrapolation="continue"),
            LinearRegression(fit_intercept=True),
        )

        X = best_combination.reshape(-1, 1)
        y = fsa.ladder_steps
        model.fit(X, y)

        sample_data_with_basepairs = (
            pd.DataFrame({"peaks": fsa.sample_data})
            .reset_index()
            .rename(columns={"index": "time"})
            .assign(
                basepairs=lambda x: model.predict(x.time.to_numpy().reshape(-1, 1))
            )
            .assign(basepairs=lambda x: x.basepairs.round(2))
            .loc[lambda x: x.basepairs >= 0]
        )

        # Every row must map to a unique basepair, otherwise downstream
        # lookups would be ambiguous -- try again with one more knot.
        if (
            sample_data_with_basepairs.shape[0]
            == sample_data_with_basepairs.basepairs.nunique()
        ):
            fsa.ladder_model = model
            fsa.sample_data_with_basepairs = sample_data_with_basepairs
            fsa.fitted_to_model = True
            return fsa
        n_knots += 1

    # if no model could be fit
    fsa.fitted_to_model = False
    return fsa


### PLOTTING ###
def plot_areas(fsa):
    """
    Return one matplotlib figure per assay showing the raw peak data
    and the fitted model, annotated with peak areas and the quotient.
    """

    def plot_helper(df):
        # FIX: removed the unused `identified_peaks` parameter -- only
        # the fitted area peaks are drawn.
        peaks = [df.loc[lambda x: x.peak_name == p] for p in df.peak_name.unique()]
        fig_areas, axs = plt.subplots(1, len(peaks), sharey=True, figsize=(20, 10))
        # if there is only one peak, `axs` is a single Axes object
        if len(peaks) == 1:
            axs.plot(df.basepairs, df.peaks, "o")
            axs.plot(df.basepairs, df.fitted)
            axs.set_title(f"Peak 1 area: {df['amplitude'].iloc[0]: .1f}")
            axs.grid()
        # if more than one peak
        else:
            for i, ax in enumerate(axs):
                ax.plot(
                    peaks[i].basepairs,
                    peaks[i].peaks,
                    "o",
                )
                ax.plot(peaks[i].basepairs, peaks[i].fitted)
                ax.set_title(
                    f"Peak {i + 1} area: {peaks[i]['amplitude'].iloc[0]: .1f}"
                )
                ax.grid()

        fig_areas.suptitle(f"Quotient: {df['quotient'].iloc[0]: .2f}")
        fig_areas.legend(["Raw data", "Model"])
        fig_areas.supxlabel("Basepairs")
        fig_areas.supylabel("Intensity")
        plt.close()

        return fig_areas

    plots = []
    for a in fsa.identified_sample_data_peaks.assay.unique():
        df = fsa.fitted_area_peaks.loc[lambda x: x.assay == a]
        plots.append(plot_helper(df))

    return plots
def make_fsa_data_df(fsa) -> pd.DataFrame:
    """
    Long-format dataframe of every raw DATA channel in the .fsa file
    with columns: data (intensity), channel (name), time (index).
    """
    frames = []
    for channel_name in fsa.fsa:
        if "DATA" not in channel_name:
            continue
        frames.append(
            pd.DataFrame()
            .assign(data=fsa.fsa[channel_name])
            .assign(channel=channel_name)
            .assign(time=lambda x: range(x.shape[0]))
        )
    return pd.concat(frames)


def plot_fsa_data(fsa) -> list:
    """
    Build one interactive altair line chart per DATA channel plus one
    combined chart. Returns a list of (chart, channel_name) tuples.
    """
    alt.data_transformers.disable_max_rows()
    df = make_fsa_data_df(fsa)

    def line_chart(data):
        return (
            alt.Chart(data)
            .mark_line()
            .encode(
                alt.X("time:Q", title="Time"),
                alt.Y("data:Q", title="Intensity"),
                alt.Color("channel:N"),
            )
            .properties(
                width=800,
                height=500,
            )
            .interactive()
        )

    plots = [
        (line_chart(df.loc[lambda x: x.channel == channel]), channel)
        for channel in df.channel.unique()
    ]
    plots.append((line_chart(df), "All channels"))
    return plots
def plot_all_found_peaks(fsa):
    """
    Figure of the sample trace around the identified peaks, each peak
    marked and labelled with its basepair position.
    """
    fig_peaks = plt.figure(figsize=(20, 10))

    identified = fsa.identified_sample_data_peaks
    low = identified.basepairs.min() - 10
    high = identified.basepairs.max() + 10
    trace = fsa.sample_data_peaks_raw.loc[lambda x: x.basepairs > low].loc[
        lambda x: x.basepairs < high
    ]

    plt.plot(trace.basepairs, trace.peaks)
    plt.plot(identified.basepairs, identified.peaks, "o")
    for bp, height in zip(identified.basepairs, identified.peaks):
        plt.text(bp, height, f"{round(bp, 1)} bp")

    plt.title(f"Channel: {fsa.sample_channel}")
    plt.xticks(
        np.arange(trace.basepairs.min(), trace.basepairs.max(), 10), rotation=90
    )
    plt.ylabel("intensity")
    plt.xlabel("basepairs")
    plt.grid()
    plt.close()

    return fig_peaks


def plot_size_standard_peaks(fsa):
    """
    Figure of the size-standard trace with the chosen ladder peaks
    marked and annotated with their ladder sizes.
    """
    chosen_peaks = fsa.best_size_standard
    trace = fsa.size_standard

    fig_ladder_peaks = plt.figure(figsize=(20, 10))
    plt.plot(trace)
    plt.plot(chosen_peaks, trace[chosen_peaks], "o")
    plt.xlabel("Time")
    plt.ylabel("Intensity")
    plt.legend(["Size standard", "Peak (bp)"])
    plt.title(fsa.ladder)
    plt.grid()

    for position, size in zip(chosen_peaks, fsa.ladder_steps):
        plt.text(position, trace[position], size)

    plt.close()
    return fig_ladder_peaks


def plot_model_fit(fsa):
    """
    Figure comparing the true ladder sizes with the model's predictions,
    annotated with the MSE and R2 of the fit.
    """
    true_sizes = fsa.ladder_steps
    chosen_peaks = fsa.best_size_standard
    predicted = fsa.ladder_model.predict(chosen_peaks.reshape(-1, 1))

    mse = mean_squared_error(true_sizes, predicted)
    r2 = r2_score(true_sizes, predicted)

    fig_model_fit = plt.figure(figsize=(20, 10))
    plt.plot(true_sizes, chosen_peaks, "o")
    plt.plot(predicted, chosen_peaks, "x")
    plt.xticks(np.arange(0, np.max(true_sizes), 50))
    plt.xlabel("bp")
    plt.yticks(np.arange(0, np.max(chosen_peaks), 500))
    plt.suptitle(fsa.ladder)
    plt.title(f"Mean squared error: {mse: .3f}, R2: {r2: .5f}")
    plt.legend(["True value", "Predicted value"])
    plt.grid()

    plt.close()
    return fig_model_fit
### PEAK FINDING ###
def find_peaks_agnostic(
    fsa,
    peak_height_sample_data: int,
    min_ratio: float,
    distance_between_assays: int,
    search_peaks_start: int,
):
    """
    Find sample peaks without a user-supplied peak table.

    Peaks above `peak_height_sample_data` (and past
    `search_peaks_start` basepairs) are grouped into assays whenever
    the gap to the previous peak exceeds `distance_between_assays`;
    peaks smaller than `min_ratio` of their assay's maximum are dropped.
    Sets `fsa.found_peaks` to "agnostic" on success, "error" otherwise.
    """
    peaks_dataframe = fsa.sample_data_with_basepairs.loc[
        lambda x: x.basepairs > search_peaks_start
    ]
    peaks_index, _ = signal.find_peaks(
        peaks_dataframe.peaks, height=peak_height_sample_data
    )

    identified_peaks = (
        peaks_dataframe.iloc[peaks_index]
        .assign(peaks_index=peaks_index)
        .assign(peak_name=lambda x: range(1, x.shape[0] + 1))
        # separate the peaks into different assay groups depending on the
        # distance between the peaks
        .assign(difference=lambda x: x.basepairs.diff())
        .fillna(100)
        .assign(
            assay=lambda x: np.select(
                [x.difference > distance_between_assays],
                [x.peak_name * 10],
                default=pd.NA,
            )
        )
        # FIX: `.fillna(method="ffill")` is deprecated in pandas >= 2.1;
        # `.ffill()` is the equivalent modern spelling.
        .ffill()
        # FIX: passing np.max to transform is deprecated; "max" is equivalent.
        .assign(max_peak=lambda x: x.groupby("assay")["peaks"].transform("max"))
        .assign(ratio=lambda x: x.peaks / x.max_peak)
        .loc[lambda x: x.ratio > min_ratio]
        .assign(peak_name=lambda x: range(1, x.shape[0] + 1))
    )

    if identified_peaks.shape[0] == 0:
        fsa.found_peaks = "error"
        return fsa

    fsa.sample_data_peaks_raw = peaks_dataframe
    fsa.identified_sample_data_peaks = identified_peaks
    fsa.found_peaks = "agnostic"

    return fsa


def read_custom_peaks(custom_peaks):
    """
    Normalize `custom_peaks` to a DataFrame (NaN filled with 0).

    Accepts an existing DataFrame or a path to a csv file; exits with an
    error for anything else.
    """
    # FIX: rewrote the nested conditional expression as a plain
    # if/elif chain -- identical behavior, readable control flow.
    if isinstance(custom_peaks, pd.DataFrame):
        table = custom_peaks.fillna(0)
    elif isinstance(custom_peaks, str):
        table = pd.read_csv(custom_peaks).fillna(0)
    else:
        table = None

    if not isinstance(table, pd.DataFrame):
        print_fail("No custom peaks could be read")
        sys.exit(1)

    return table
def custom_peaks_are_overlapping(custom_peaks):
    """
    Exit with an error if any [start, stop) ranges in the custom peaks
    table overlap each other; return None otherwise.
    """
    df = read_custom_peaks(custom_peaks)
    # NOTE(review): range() requires integer start/stop -- values read
    # from csv may arrive as floats; confirm upstream typing.
    test = (
        df.sort_values("start")
        .assign(intervals=lambda x: [range(y.start, y.stop) for y in x.itertuples()])
        .explode("intervals")
    )

    if test.shape[0] != test.intervals.nunique():
        # Report the lowest overlapping value.
        dups = (
            test.value_counts("intervals")
            .reset_index()
            .sort_values("intervals")
            .loc[lambda x: x["count"] > 1]
            .iloc[0, 0]
        )
        print_fail(
            f"Custom peaks contains overlapping ranges starting at value: {dups}"
        )
        sys.exit(1)


def custom_peaks_has_columns(custom_peaks):
    """
    Exit with an error unless the custom peaks table has exactly the
    required columns; return None otherwise.
    """
    df = read_custom_peaks(custom_peaks)

    required = {
        "name",
        "start",
        "stop",
        "amount",
        "min_ratio",
        "which",
        "peak_distance",
    }
    df_columns = set(df.columns)
    # FIX: the original's two checks (length, then intersection size)
    # are together equivalent to plain set equality; also dropped a
    # pointless f-prefix on the placeholder-free message.
    if df_columns != required:
        print_fail("Customized peaks table does not contain the right columns.")
        print_fail(f"Current columns: {df_columns}, Needed columns: {required}")
        sys.exit(1)
def find_peaks_customized(
    fsa,
    custom_peaks,
    peak_height_sample_data: int,
    search_peaks_start: int,
):
    """
    Find sample peaks restricted to user-defined basepair windows.

    For each row of the custom peaks table, peaks inside
    (start, stop) are kept; when `amount` is non-zero the peaks are
    additionally ranked (`which` == "LARGEST") or taken in order
    (`which` == "FIRST") and filtered by `min_ratio` and
    `peak_distance`. Sets `fsa.found_peaks` to "custom_peaks" on
    success, "error" when nothing was found.
    """

    def _filter_by_peak_distance(df, max_distance):
        # Drop peaks further than `max_distance` basepairs from their
        # predecessor inside the assay window.
        return (
            df.assign(distance=lambda x: x.basepairs.diff())
            .assign(distance=lambda x: x.distance.fillna(0))
            .loc[lambda x: x.distance <= max_distance]
            .drop(columns=["distance"])
        )

    custom_peaks = read_custom_peaks(custom_peaks)

    peaks_dataframe = fsa.sample_data_with_basepairs.loc[
        lambda x: x.basepairs > search_peaks_start
    ]
    peaks_index, _ = signal.find_peaks(
        peaks_dataframe.peaks, height=peak_height_sample_data
    )

    # Filter the df to get right peaks
    identified_peaks = peaks_dataframe.iloc[peaks_index].assign(peaks_index=peaks_index)
    # Filter the above df based on the custom peaks from the user
    customized_peaks = []
    for assay in custom_peaks.itertuples():
        df = (
            identified_peaks.loc[lambda x: x.basepairs > assay.start]
            .loc[lambda x: x.basepairs < assay.stop]
            .assign(assay=assay.name)
        )

        # Rank the peaks by height and filter out the smallest ones
        # NOTE(review): read_custom_peaks() fills NaN with 0, so an empty
        # `which` cell arrives here as 0 (not ""), which falls through to
        # the error branch -- confirm this is intended.
        if assay.amount != 0:
            if assay.which == "LARGEST" or assay.which == "":
                df = (
                    df.assign(max_peak=lambda x: x.peaks.max())
                    .assign(ratio=lambda x: x.peaks / x.max_peak)
                    .loc[lambda x: x.ratio > assay.min_ratio]
                    .assign(rank_peak=lambda x: x.peaks.rank(ascending=False))
                    .loc[lambda x: x.rank_peak <= assay.amount]
                    .drop(columns=["rank_peak"])
                )
                if assay.peak_distance != 0:
                    # FIX: factored the duplicated distance filter into a
                    # single helper.
                    df = _filter_by_peak_distance(df, assay.peak_distance)

            elif assay.which == "FIRST":
                df = (
                    df.assign(max_peak=lambda x: x.peaks.max())
                    .assign(ratio=lambda x: x.peaks / x.max_peak)
                    .loc[lambda x: x.ratio > assay.min_ratio]
                    .sort_values("basepairs", ascending=True)
                    .head(assay.amount)
                )
                if assay.peak_distance != 0:
                    df = _filter_by_peak_distance(df, assay.peak_distance)
            else:
                print_fail("Column `which` must be `FIRST` or `LARGEST`")
                # FIX: use sys.exit() like the rest of the module instead
                # of the interactive-only builtin exit().
                sys.exit(1)

        customized_peaks.append(df)

    identified_peaks = (
        pd.concat(customized_peaks)
        .reset_index()
        .assign(peak_name=lambda x: range(1, x.shape[0] + 1))
    )

    if identified_peaks.shape[0] == 0:
        fsa.found_peaks = "error"
        return fsa

    fsa.sample_data_peaks_raw = peaks_dataframe
    fsa.identified_sample_data_peaks = identified_peaks
    fsa.found_peaks = "custom_peaks"

    return fsa
### PEAK AREA ###
def find_peak_widths(fsa, rel_height: float = 0.95):
    """
    Measure the width of every identified peak at `rel_height` of its
    prominence and merge the integer start/end indices back onto the
    identified peaks table (stored on `fsa.sample_data_peak_widths`).
    """
    widths = signal.peak_widths(
        fsa.sample_data_peaks_raw.peaks,
        fsa.identified_sample_data_peaks.peaks_index,
        rel_height=rel_height,
    )

    df = pd.DataFrame(widths).T
    df.columns = ["x", "peak_height", "peak_start", "peak_end"]

    peak_widths = (
        df.assign(peak_start=lambda x: np.floor(x.peak_start).astype(int))
        .assign(peak_end=lambda x: np.ceil(x.peak_end).astype(int))
        .assign(peak_name=lambda x: range(1, x.shape[0] + 1))
        .merge(fsa.identified_sample_data_peaks, on="peak_name")
    )

    fsa.sample_data_peak_widths = peak_widths
    return fsa


def find_peaks_with_padding(fsa, padding: int = 4):
    """
    Slice the raw trace around every measured peak, plus `padding` rows
    on each side, so the whole peak is included for area fitting.
    """
    df = fsa.sample_data_peak_widths
    divided_peak_widths = [df.loc[df.assay == x] for x in df.assay.unique()]
    peaks_with_padding = []
    for assay in divided_peak_widths:
        whole_peaks = [
            (
                fsa.sample_data_peaks_raw.iloc[
                    # BUG FIX: clamp the start at 0 -- a negative start
                    # would make iloc slice from the END of the dataframe.
                    max(x.peak_start - padding, 0) : x.peak_end + padding
                ]
                .assign(assay=x.assay)
                .assign(peak_name=x.peak_name)
            )
            for x in assay.itertuples()
        ]
        peaks_with_padding.append(pd.concat(whole_peaks))

    fsa.peaks_with_padding = pd.concat(peaks_with_padding)
    return fsa
def fit_lmfit_model_to_area(fsa, peak_finding_model: str):
    """
    Fit an lmfit line-shape model to every identified peak and record
    the fit parameters (area/amplitude, center, sigma, fwhm, r_value)
    on each data point of the peak.

    Raises NotImplementedError for unknown model names.
    """
    if peak_finding_model == "gauss":
        model = GaussianModel()
    elif peak_finding_model == "voigt":
        model = VoigtModel()
    elif peak_finding_model == "lorentzian":
        model = LorentzianModel()
    else:
        raise NotImplementedError(
            f"""
            {peak_finding_model} is not implemented! 
            Options: [gauss, voigt, lorentzian]
            """
        )

    fitted_peaks = []
    for a in fsa.peaks_with_padding.assay.unique():
        assay = fsa.peaks_with_padding.loc[lambda x: x.assay == a]
        for p in assay.peak_name.unique():
            peak_df = assay.loc[lambda x: x.peak_name == p]
            y = peak_df.peaks.to_numpy()
            x = peak_df.time.to_numpy()
            params = model.guess(y, x)
            out = model.fit(y, params, x=x)

            # Every row of the peak receives the same scalar fit results.
            fitted_peak = peak_df.assign(
                fitted=out.best_fit,
                model=peak_finding_model,
                amplitude=out.values["amplitude"],
                center=out.values["center"],
                sigma=out.values["sigma"],
                fwhm=out.values["fwhm"],
                fit_report=out.fit_report(),
                # NOTE(review): assumes the lmfit report always contains an
                # "R-squared = 0.xxx" line -- IndexError otherwise; confirm
                # against the pinned lmfit version.
                r_value=float(
                    re.findall(r"R-squared *= (0\.\d{3})", out.fit_report())[0]
                ),
            )
            fitted_peaks.append(fitted_peak)

    fsa.fitted_area_peaks = pd.concat(fitted_peaks)
    return fsa


def calculate_quotients(fsa):
    """
    Compute an area quotient per assay from the fitted peak amplitudes
    and attach it to every row of `fsa.fitted_area_peaks`.
    """
    quotients = []
    for a in fsa.fitted_area_peaks.assay.unique():
        assay = fsa.fitted_area_peaks.loc[lambda x: x.assay == a]
        df = (
            assay[["assay", "peak_name", "amplitude", "basepairs"]]
            # Collapse basepairs to the first row's value so that
            # drop_duplicates leaves one row per peak.
            .assign(basepairs=lambda x: x.iloc[0, -1])
            .drop_duplicates()
        )

        wide = df.pivot_wider(
            index=["assay", "basepairs"],
            names_from=["peak_name"],
            values_from=["amplitude"],
        )

        # NOTE(review): the quotient depends on column POSITIONS after
        # pivoting: >2 peaks -> last amplitude over the mean of the first
        # two; exactly 1 peak -> 0; 2 peaks -> second over first. Confirm
        # this matches the assay design before touching this code.
        if wide.shape[1] > 4:
            quotient = wide.assign(
                quotient=lambda x: x.iloc[0, -1] / x.iloc[0, 2:4].mean()
            ).quotient.squeeze()
        elif wide.shape[1] == 3:
            quotient = 0
        else:
            quotient = wide.assign(
                quotient=lambda x: x.iloc[0, 3] / x.iloc[0, 2]
            ).quotient.squeeze()
        assay = assay.assign(quotient=quotient)
        quotients.append(assay)

    fsa.fitted_area_peaks = pd.concat(quotients)
    return fsa


def update_identified_sample_data_peaks(fsa):
    """
    Merge the model name, fit r_value and quotient from the fitted area
    peaks back into the identified peaks table (one row per time point).
    """
    a = fsa.identified_sample_data_peaks
    b = fsa.fitted_area_peaks[["assay", "model", "r_value", "quotient"]]

    a = a.merge(b).drop_duplicates("time")
    fsa.identified_sample_data_peaks = a
    return fsa
def cli():
    """Parse command-line arguments and dispatch to `main`."""
    parser = argparse.ArgumentParser(
        description="Analyze your Fragment analysis files!"
    )
    parser.add_argument(
        "-t",
        "--type",
        type=str,
        required=True,
        choices=["area", "peak"],
        help="Fraggler area or fraggler peak",
    )
    parser.add_argument("-f", "--fsa", required=True, help="fsa file to analyze")
    parser.add_argument(
        "-o",
        "--output",
        required=True,
        help="Output folder",
    )
    valid_ladders = list(LADDERS)
    parser.add_argument(
        "-l",
        "--ladder",
        choices=valid_ladders,
        required=True,
        help="Which ladder to use",
    )
    parser.add_argument(
        "-sc",
        "--sample_channel",
        required=True,
        help="Which sample channel to use. E.g: 'DATA1', 'DATA2'...",
    )
    parser.add_argument(
        "-min_dist",
        "--min_distance_between_peaks",
        required=False,
        type=int,
        default=30,
        help="Minimum distance between size standard peaks",
    )
    parser.add_argument(
        "-min_s_height",
        "--min_size_standard_height",
        required=False,
        type=int,
        default=100,
        help="Minimun height of size standard peaks",
    )
    parser.add_argument(
        "-cp",
        "--custom_peaks",
        required=False,
        default=None,
        type=str,
        help="csv file with custom peaks to find",
    )
    parser.add_argument(
        "-height_sample",
        "--peak_height_sample_data",
        required=False,
        default=300,
        type=int,
        help="Minimum height of peaks in sample data",
    )
    parser.add_argument(
        "-min_ratio",
        "--min_ratio_to_allow_peak",
        required=False,
        default=0.15,
        type=float,
        help="Minimum ratio of the lowest peak compared to the heighest peak in the assay",
    )
    parser.add_argument(
        "-distance",
        "--distance_between_assays",
        required=False,
        default=15,
        type=int,
        help="Minimum distance between assays in a multiple assay experiment",
    )
    parser.add_argument(
        "-peak_start",
        "--search_peaks_start",
        required=False,
        default=115,
        type=int,
        help="Where to start searching for peaks in basepairs",
    )
    parser.add_argument(
        "-m",
        "--peak_area_model",
        required=False,
        default="gauss",
        choices=["gauss", "voigt", "lorentzian"],
        type=str,
        help="Which peak finding model to use",
    )

    args = parser.parse_args()

    print(ASCII_ART)
    command = "fraggler \n" + "".join(f"{k}: {v}\n" for k, v in vars(args).items())
    print_green(command)

    main(
        command,
        sub_command=args.type,
        fsa=args.fsa,
        output=args.output,
        ladder=args.ladder,
        sample_channel=args.sample_channel,
        min_distance_between_peaks=args.min_distance_between_peaks,
        min_size_standard_height=args.min_size_standard_height,
        custom_peaks=args.custom_peaks,
        peak_height_sample_data=args.peak_height_sample_data,
        min_ratio_to_allow_peak=args.min_ratio_to_allow_peak,
        distance_between_assays=args.distance_between_assays,
        search_peaks_start=args.search_peaks_start,
        peak_area_model=args.peak_area_model,
    )


def write_log(file, *text):
    """Append each item in `text` to `file`, one per line."""
    with open(file, "a+") as f:
        for line in text:
            print(line, file=f)


def read_valid_csv(csv):
    """Read `csv` with pandas; exit with an error message if unreadable."""
    try:
        df = pd.read_csv(csv)
        return df
    # FIX: narrowed from a bare `except:`, which would also swallow
    # SystemExit and KeyboardInterrupt.
    except Exception:
        print_fail(f"{csv} cannot be read!")
        sys.exit(1)


def parse_fsa(
    fsa,
    ladder,
    sample_channel,
    min_distance_between_peaks,
    min_size_standard_height,
):
    """
    Construct an FsaFile, returning None when parsing fails
    (deliberately best-effort; callers check for None).
    """
    try:
        fsa = FsaFile(
            fsa,
            ladder,
            sample_channel=sample_channel,
            min_distance_between_peaks=min_distance_between_peaks,
            min_size_standard_height=min_size_standard_height,
        )
        return fsa
    # FIX: narrowed from a bare `except:`.
    except Exception:
        return None
### REPORT ###
# Panel configuration for the generated HTML reports.
pn.extension("tabulator")
pn.extension("vega", sizing_mode="stretch_width", template="fast")
pn.widgets.Tabulator.theme = "modern"


def header(
    text: str,
    bg_color: str = "#04c273",
    height: int = 300,
    fontsize: str = "px20",
    textalign: str = "center",
):
    """
    Template for markdown header like block
    """
    # NOTE(review): `background` and `style` are legacy pn.pane.Markdown
    # parameters (removed in panel >= 1.0) -- confirm against the pinned
    # panel version.
    return pn.pane.Markdown(
        f"""
        {text}
        """,
        background=bg_color,
        height=height,
        margin=10,
        style={
            "color": "white",
            "padding": "10px",
            "text-align": f"{textalign}",
            "font-size": f"{fontsize}",
        },
    )


def make_header(name: str, date: str) -> pn.pane.Markdown:
    """Blue top banner showing the file name and run date."""
    return header(
        text=f"""
        # Fraggler Report
        ## Report of {name}
        ## Date: {date}
        """,
        fontsize="20px",
        bg_color="#03a1fc",
        height=250,
    )


def generate_peak_report(fsa):
    """
    Build the full panel report for a fraggler "peak" analysis:
    channel plots, peak plot, ladder information and a table of the
    identified peaks.
    """
    ### ----- Raw Data ----- ###
    channel_header = header(
        text="## Plot of channels",
        bg_color="#04c273",
        height=80,
        textalign="left",
    )
    # PLOT
    channel_tab = pn.Tabs()
    for plot, name in plot_fsa_data(fsa):
        pane = pn.pane.Vega(plot.interactive(), sizing_mode="stretch_both", name=name)
        channel_tab.append(pane)

    channels_section = pn.Column(channel_header, channel_tab)

    ### ----- Peaks ----- ###
    peaks_header = header(
        text="## Plot of Peaks",
        bg_color="#04c273",
        height=80,
        textalign="left",
    )

    # PLOT
    peaks_plot = plot_all_found_peaks(fsa)
    peaks_pane = pn.pane.Matplotlib(peaks_plot, name="Peaks")

    # Section
    peaks_tab = pn.Tabs(
        peaks_pane,
    )
    peaks_section = pn.Column(peaks_header, peaks_tab)

    ### ----- Ladder Information ----- ###
    ladder_header = header(
        text="## Information about the ladder",
        bg_color="#04c273",
        height=80,
        textalign="left",
    )
    # Ladder peak plot
    ladder_plot = plot_size_standard_peaks(fsa)
    ladder_peak_plot = pn.pane.Matplotlib(
        ladder_plot,
        name="Ladder Peak Plot",
    )
    # Ladder Correlation
    model_fit = plot_model_fit(fsa)
    ladder_correlation_plot = pn.pane.Matplotlib(
        model_fit,
        name="Ladder Correlation Plot",
    )

    # Section
    ladder_tab = pn.Tabs(
        ladder_peak_plot,
        ladder_correlation_plot,
    )
    ladder_section = pn.Column(ladder_header, ladder_tab)

    ### ----- Peaks dataframe ----- ###
    dataframe_header = header(
        text="## Peaks Table", bg_color="#04c273", height=80, textalign="left"
    )
    # Create dataframe
    df = fsa.identified_sample_data_peaks.assign(file_name=fsa.file_name)[
        ["basepairs", "assay", "peak_name", "file_name"]
    ]
    # DataFrame Tabulator
    peaks_df_tab = pn.widgets.Tabulator(
        df,
        layout="fit_columns",
        pagination="local",
        page_size=15,
        show_index=False,
        name="Peaks Table",
    )

    # Section
    dataframe_tab = pn.Tabs(peaks_df_tab)
    dataframe_section = pn.Column(dataframe_header, dataframe_tab)

    ### CREATE REPORT ###

    file_name = fsa.file_name
    # RUND1 is the run date tag of the ABIF record.
    date = fsa.fsa["RUND1"]
    head = make_header(file_name, date)

    all_tabs = pn.Tabs(
        ("Channels", channels_section),
        ("Peaks", peaks_section),
        ("Ladder", ladder_section),
        ("Peaks Table", dataframe_section),
        tabs_location="left",
    )
    report = pn.Column(
        head,
        pn.layout.Divider(),
        all_tabs,
    )

    return report
1224 channels_section = pn.Column(channel_header, channel_tab) 1225 1226 ### ----- Peaks ----- ### 1227 peaks_header = header( 1228 text="## Plot of Peaks", 1229 bg_color="#04c273", 1230 height=80, 1231 textalign="left", 1232 ) 1233 1234 # PLOT 1235 peaks_plot = plot_all_found_peaks(fsa) 1236 peaks_pane = pn.pane.Matplotlib(peaks_plot, name="Peaks") 1237 1238 # Section 1239 peaks_tab = pn.Tabs( 1240 peaks_pane, 1241 ) 1242 peaks_section = pn.Column(peaks_header, peaks_tab) 1243 1244 ### ----- Ladder Information ----- ### 1245 ladder_header = header( 1246 text="## Information about the ladder", 1247 bg_color="#04c273", 1248 height=80, 1249 textalign="left", 1250 ) 1251 # Ladder peak plot 1252 ladder_plot = plot_size_standard_peaks(fsa) 1253 ladder_peak_plot = pn.pane.Matplotlib( 1254 ladder_plot, 1255 name="Ladder Peak Plot", 1256 ) 1257 # Ladder Correlation 1258 model_fit = plot_model_fit(fsa) 1259 ladder_correlation_plot = pn.pane.Matplotlib( 1260 model_fit, 1261 name="Ladder Correlation Plot", 1262 ) 1263 1264 # Section 1265 ladder_tab = pn.Tabs( 1266 ladder_peak_plot, 1267 ladder_correlation_plot, 1268 ) 1269 ladder_section = pn.Column(ladder_header, ladder_tab) 1270 1271 ### ----- Areas Information ----- ### 1272 areas_header = header( 1273 text="## Peak Areas", bg_color="#04c273", height=80, textalign="left" 1274 ) 1275 areas_tab = pn.Tabs() 1276 area_plots_list = plot_areas(fsa) 1277 for i, plot in enumerate(area_plots_list): 1278 name = f"Assay {i + 1}" 1279 plot_pane = pn.pane.Matplotlib(plot, name=name) 1280 areas_tab.append(plot_pane) 1281 1282 # Section 1283 areas_section = pn.Column(areas_header, areas_tab) 1284 1285 ### ----- Peaks DataFrame ----- ### 1286 dataframe_header = header( 1287 text="## Peaks Table", bg_color="#04c273", height=80, textalign="left" 1288 ) 1289 1290 df = fsa.identified_sample_data_peaks[ 1291 ["basepairs", "assay", "peak_name", "model", "r_value", "quotient"] 1292 ] 1293 1294 # DataFrame Tabulator 1295 peaks_df_tab = 
pn.widgets.Tabulator( 1296 df, 1297 layout="fit_columns", 1298 pagination="local", 1299 page_size=15, 1300 show_index=False, 1301 name="Peaks Table", 1302 ) 1303 1304 # Section 1305 dataframe_tab = pn.Tabs(peaks_df_tab) 1306 dataframe_section = pn.Column(dataframe_header, dataframe_tab) 1307 1308 ### CREATE REPORT ### 1309 file_name = fsa.file_name 1310 date = fsa.fsa["RUND1"] 1311 head = make_header(file_name, date) 1312 1313 all_tabs = pn.Tabs( 1314 ("Channels", channels_section), 1315 ("Peaks", peaks_section), 1316 ("Ladder", ladder_section), 1317 ("Areas", areas_section), 1318 ("Peak Table", dataframe_section), 1319 tabs_location="left", 1320 ) 1321 report = pn.Column( 1322 head, 1323 pn.layout.Divider(), 1324 all_tabs, 1325 ) 1326 1327 return report 1328 1329 1330def generate_no_peaks_report(fsa): 1331 channel_header = header( 1332 text="## Plot of channels", 1333 bg_color="#04c273", 1334 height=80, 1335 textalign="left", 1336 ) 1337 # PLOT 1338 channel_tab = pn.Tabs() 1339 for plot, name in plot_fsa_data(fsa): 1340 pane = pn.pane.Vega(plot.interactive(), sizing_mode="stretch_both", name=name) 1341 channel_tab.append(pane) 1342 channels_section = pn.Column(channel_header, channel_tab) 1343 1344 ### CREATE REPORT ### 1345 file_name = fsa.file_name 1346 date = fsa.fsa["RUND1"] 1347 head = header( 1348 "# No peaks could be generated. 
Please look at the raw data.", height=100 1349 ) 1350 1351 all_tabs = pn.Tabs( 1352 ("Channels", channels_section), 1353 tabs_location="left", 1354 ) 1355 report = pn.Column( 1356 head, 1357 pn.layout.Divider(), 1358 all_tabs, 1359 ) 1360 1361 return report 1362 1363 1364def main( 1365 command, 1366 sub_command, 1367 fsa, 1368 output, 1369 ladder, 1370 sample_channel, 1371 min_distance_between_peaks, 1372 min_size_standard_height, 1373 custom_peaks, 1374 peak_height_sample_data, 1375 min_ratio_to_allow_peak, 1376 distance_between_assays, 1377 search_peaks_start, 1378 peak_area_model, 1379): 1380 today = datetime.today().strftime('%Y-%m-%d %H:%M:%S') 1381 FAILED_FILES = [] 1382 PEAK_TABLES = [] 1383 start_time = datetime.now() 1384 print_green(f"Running fraggler {sub_command}!") 1385 1386 folder_exists(output) 1387 make_dir(output) 1388 1389 LOG_FILE = f"{output}/fraggler.log" 1390 write_log( 1391 LOG_FILE, 1392 f"Date: {datetime.today().strftime('%Y-%m-%d %H:%M:%S')}", 1393 "", 1394 "Command:", 1395 command, 1396 "---------------------------", 1397 ) 1398 1399 fsa_files = get_files(fsa) 1400 1401 for fsa in fsa_files: 1402 print_green(f" Running fraggler on {fsa}") 1403 write_log(LOG_FILE, f"Parsing {fsa}:") 1404 fsa_file = parse_fsa( 1405 fsa, 1406 ladder, 1407 sample_channel=sample_channel, 1408 min_distance_between_peaks=min_distance_between_peaks, 1409 min_size_standard_height=min_size_standard_height, 1410 ) 1411 1412 if fsa_file == None: 1413 print_fail(f"Could not parse fsa {fsa}") 1414 write_log( 1415 LOG_FILE, 1416 "Could not parse fsa", 1417 "Aborting", 1418 "", 1419 ) 1420 print_warning(f"Continuing to the next file...") 1421 print("") 1422 FAILED_FILES.append(fsa) 1423 continue 1424 1425 print_green(f"Using size standard channel: {fsa_file.size_standard_channel}") 1426 write_log(LOG_FILE, f"Size standard channel: {fsa_file.size_standard_channel}") 1427 1428 fsa = find_size_standard_peaks(fsa_file) 1429 1430 # if to few size standard peaks are found 1431 
if len(fsa.size_standard_peaks) < len(fsa.ladder_steps): 1432 write_log( 1433 LOG_FILE, 1434 "To few size standard peaks found", 1435 "Aborting", 1436 "", 1437 ) 1438 print_warning("To few size standard peaks found") 1439 print_warning(f"Ladder peaks number: {len(fsa.ladder_steps)}") 1440 print_warning(f"Found size standard peaks: {len(fsa.size_standard_peaks)}") 1441 print_warning("Try changing the --min_size_standard_height") 1442 print_warning(f"Current value: {min_size_standard_height}") 1443 print_warning(f"...Or change ladder. Current ladder {ladder}") 1444 print_warning(f"Generating a report of the raw data, please have a look...") 1445 no_peaks_report = generate_no_peaks_report(fsa) 1446 no_peaks_report.save(f"{output}/{fsa.file_name}_fraggler_fail.html") 1447 print_warning(f"Continuing to the next file...") 1448 print("") 1449 FAILED_FILES.append(fsa.file_name) 1450 continue 1451 1452 1453 fsa = return_maxium_allowed_distance_between_size_standard_peaks( 1454 fsa, multiplier=2 1455 ) 1456 1457 # try to find a good number for allowed diffs between peaks 1458 for _ in range(20): 1459 fsa = generate_combinations(fsa) 1460 if fsa.best_size_standard_combinations.shape[0] > 0: 1461 break 1462 1463 fsa.maxium_allowed_distance_between_size_standard_peaks += 10 1464 1465 if fsa.best_size_standard_combinations.shape[0] == 0: 1466 write_log( 1467 LOG_FILE, 1468 "No combinations of the size standard could be made", 1469 "Aborting", 1470 "", 1471 ) 1472 print_warning("No combinations of the size standard could be made") 1473 print_warning("Try changing the --min_size_standard_height") 1474 print_warning(f"Current value: {min_size_standard_height}") 1475 print_warning(f"...Or change ladder. 
Current ladder {ladder}") 1476 print_warning(f"Generating a report of the raw data, please have a look...") 1477 no_peaks_report = generate_no_peaks_report(fsa) 1478 no_peaks_report.save(f"{output}/{fsa.file_name}_fraggler_fail.html") 1479 print_warning(f"Continuing to the next file...") 1480 print("") 1481 FAILED_FILES.append(fsa.file_name) 1482 continue 1483 1484 fsa = calculate_best_combination_of_size_standard_peaks(fsa) 1485 fsa = fit_size_standard_to_ladder(fsa) 1486 1487 # if no model could be fitted 1488 if not fsa.fitted_to_model: 1489 print_warning(f"No ladder model could be fitted to {fsa.file_name}...") 1490 print_warning("Try changing the --min_size_standard_height") 1491 print_warning(f"Current value: {min_size_standard_height}") 1492 write_log( 1493 LOG_FILE, 1494 "No ladder model could be fitted", 1495 "Aborting", 1496 "", 1497 ) 1498 print_warning(f"Generating a report of the raw data, please have a look...") 1499 no_peaks_report = generate_no_peaks_report(fsa) 1500 no_peaks_report.save(f"{output}/{fsa.file_name}_fraggler_fail.html") 1501 print_warning(f"Continuing to the next file...") 1502 print("") 1503 FAILED_FILES.append(fsa.file_name) 1504 continue 1505 1506 1507 if custom_peaks: 1508 print_green(f"Using custom peaks") 1509 # test if custom peaks are ok 1510 custom_peaks = read_valid_csv(custom_peaks) 1511 custom_peaks_are_overlapping(custom_peaks) 1512 custom_peaks_has_columns(custom_peaks) 1513 fsa = find_peaks_customized( 1514 fsa, 1515 custom_peaks, 1516 peak_height_sample_data=peak_height_sample_data, 1517 search_peaks_start=search_peaks_start, 1518 ) 1519 else: # find peak agnostic 1520 print_green(f"Finding peaks agnostic") 1521 fsa = find_peaks_agnostic( 1522 fsa, 1523 peak_height_sample_data=peak_height_sample_data, 1524 min_ratio=min_ratio_to_allow_peak, 1525 distance_between_assays=distance_between_assays, 1526 search_peaks_start=search_peaks_start, 1527 ) 1528 1529 # if no found peaks 1530 if fsa.found_peaks == "error": 1531 
print_warning(f"No peaks could be detected for {fsa.file_name}...") 1532 print_warning("Try changing the --peak_height_sample_data") 1533 print_warning(f"Current value: {peak_height_sample_data}") 1534 write_log( 1535 LOG_FILE, 1536 "No peaks in sampel data channel could be identified", 1537 "Aborting", 1538 "", 1539 ) 1540 print_warning(f"Generating a report of the raw data, please have a look...") 1541 no_peaks_report = generate_no_peaks_report(fsa) 1542 no_peaks_report.save(f"{output}/{fsa.file_name}_fraggler_fail.html") 1543 print_warning(f"Continuing to the next file...") 1544 print("") 1545 FAILED_FILES.append(fsa.file_name) 1546 continue 1547 1548 print_blue(f"Found {fsa.identified_sample_data_peaks.assay.nunique()} assays") 1549 print_blue(f"Found {fsa.identified_sample_data_peaks.shape[0]} peaks") 1550 write_log( 1551 LOG_FILE, 1552 f"Found {fsa.identified_sample_data_peaks.assay.nunique()} assays", 1553 f"Found {fsa.identified_sample_data_peaks.shape[0]} peaks", 1554 ) 1555 1556 if sub_command == "peak": 1557 # save csv 1558 peak_table = ( 1559 fsa.identified_sample_data_peaks 1560 .assign(file_name=fsa.file_name) 1561 [["basepairs", "assay", "peak_name", "file_name"]] 1562 ) 1563 1564 PEAK_TABLES.append(peak_table) 1565 1566 # create peak report 1567 print_green("Creating peak report...") 1568 report = generate_peak_report(fsa) 1569 report.save(f"{output}/{fsa.file_name}_fraggler_report.html") 1570 1571 print_green(f"Fraggler done for {fsa.file_name}") 1572 print("") 1573 write_log(LOG_FILE, "") 1574 1575 if sub_command == "area": 1576 fsa = find_peak_widths(fsa) 1577 fsa = find_peaks_with_padding(fsa) 1578 fsa = fit_lmfit_model_to_area(fsa, peak_area_model) 1579 1580 if fsa.fitted_area_peaks.shape[0] == 0: 1581 print_warning(f"No areas could be fiitted for {fsa.file_name}...") 1582 write_log( 1583 LOG_FILE, 1584 "No areas could be fitted", 1585 "Aborting", 1586 "", 1587 ) 1588 print_warning(f"Generating a report of the raw data, please have a look...") 
1589 no_peaks_report = generate_no_peaks_report(fsa) 1590 no_peaks_report.save(f"{output}/{fsa.file_name}_fraggler_fail.html") 1591 print_warning(f"Continuing to the next file...") 1592 print("") 1593 FAILED_FILES.append(fsa.file_name) 1594 continue 1595 1596 1597 fsa = calculate_quotients(fsa) 1598 fsa = update_identified_sample_data_peaks(fsa) 1599 1600 # add peaks table to list and concat later 1601 peak_table = ( 1602 fsa.identified_sample_data_peaks[ 1603 ["basepairs", "assay", "peak_name", "model", "r_value", "quotient"] 1604 ].assign(file_name=fsa.file_name) 1605 ) 1606 PEAK_TABLES.append(peak_table) 1607 1608 # create report 1609 print_green("Creating area report...") 1610 report = generate_area_report(fsa) 1611 report.save(f"{output}/{fsa.file_name}_fraggler_report.html") 1612 1613 print_green(f"Fraggler done for {fsa.file_name}") 1614 print("") 1615 write_log(LOG_FILE, "") 1616 1617 # save csv file in peak report 1618 if len(PEAK_TABLES) > 0: 1619 pd.concat(PEAK_TABLES).to_csv( 1620 f"{output}/fraggler_peaks.csv", 1621 index=False 1622 ) 1623 1624 print_green("Fraggler done!") 1625 1626 if len(FAILED_FILES) > 0: 1627 print_warning("Following files failed:") 1628 write_log(LOG_FILE, "Following files failed:") 1629 write_log(LOG_FILE, *FAILED_FILES) 1630 for file in FAILED_FILES: 1631 print_warning(f" {file}") 1632 1633 end_time = datetime.now() 1634 time_diff = end_time - start_time 1635 write_log( 1636 LOG_FILE, 1637 "---------------------------", 1638 f"Time runned : {time_diff}", 1639 "---------------------------" 1640 ) 1641 1642 1643 1644if __name__ == "__main__": 1645 cli() 1646 sys.exit(0)
class
bcolors:
def
print_green(text):
def
print_warning(text):
def
print_fail(text):
def
print_blue(text):
@pf.register_dataframe_method
def
pivot_wider( df_: pandas.core.frame.DataFrame, index: list, names_from: list, values_from: list) -> pandas.core.frame.DataFrame:
@pf.register_dataframe_method
def pivot_wider(
    df_: pd.DataFrame, index: list, names_from: list, values_from: list
) -> pd.DataFrame:
    """
    Spread rows into columns (tidyr-style pivot_wider) and flatten the
    resulting MultiIndex columns into single underscore-joined names.
    """
    wide = df_.pivot(index=index, columns=names_from, values=values_from).reset_index()
    # Join each (possibly multi-level) column label into one flat name,
    # dropping stray leading/trailing underscores from empty levels.
    wide.columns = [
        "_".join(str(part) for part in label).strip("_") for label in wide.columns
    ]
    return wide
def
baseline_arPLS(y, ratio=0.99, lam=100, niter=1000, full_output=False):
82def baseline_arPLS(y, ratio=0.99, lam=100, niter=1000, full_output=False): 83 """ 84 Taken from: 85 https://stackoverflow.com/questions/29156532/python-baseline-correction-library 86 87 from this paper: 88 https://pubs.rsc.org/en/content/articlelanding/2015/AN/C4AN01061B#!divAbstract 89 """ 90 L = len(y) 91 92 diag = np.ones(L - 2) 93 D = sparse.spdiags([diag, -2 * diag, diag], [0, -1, -2], L, L - 2) 94 95 H = lam * D.dot(D.T) # The transposes are flipped w.r.t the Algorithm on pg. 252 96 97 w = np.ones(L) 98 W = sparse.spdiags(w, 0, L, L) 99 100 crit = 1 101 count = 0 102 103 while crit > ratio: 104 z = linalg.spsolve(W + H, W * y) 105 d = y - z 106 dn = d[d < 0] 107 108 m = np.mean(dn) 109 s = np.std(dn) 110 111 w_new = 1 / (1 + np.exp(2 * (d - (2 * s - m)) / s)) 112 113 crit = norm(w_new - w) / norm(w) 114 115 w = w_new 116 W.setdiag(w) # Do not create a new matrix, just update diagonal values 117 118 count += 1 119 120 if count > niter: 121 print("Maximum number of iterations exceeded") 122 break 123 124 if full_output: 125 info = {"num_iter": count, "stop_criterion": crit} 126 return z, d, info 127 else: 128 return z
def
make_dir(outpath: str) -> None:
def
get_files(in_path: str) -> list[pathlib.Path]:
def
file_exists(file):
def
folder_exists(folder):
ASCII_ART =
'\x1b[94m\n █████▒██▀███ ▄▄▄ ▄████ ▄████ ██▓ ▓█████ ██▀███\n ▓██ ▒▓██ ▒ ██▒▒████▄ ██▒ ▀█▒ ██▒ ▀█▒▓██▒ ▓█ ▀ ▓██ ▒ ██▒\n ▒████ ░▓██ ░▄█ ▒▒██ ▀█▄ ▒██░▄▄▄░▒██░▄▄▄░▒██░ ▒███ ▓██ ░▄█ ▒\n ░▓█▒ ░▒██▀▀█▄ ░██▄▄▄▄██ ░▓█ ██▓░▓█ ██▓▒██░ ▒▓█ ▄ ▒██▀▀█▄\n ░▒█░ ░██▓ ▒██▒ ▓█ ▓██▒░▒▓███▀▒░▒▓███▀▒░██████▒░▒████▒░██▓ ▒██▒\n ▒ ░ ░ ▒▓ ░▒▓░ ▒▒ ▓▒█░ ░▒ ▒ ░▒ ▒ ░ ▒░▓ ░░░ ▒░ ░░ ▒▓ ░▒▓░\n ░ ░▒ ░ ▒░ ▒ ▒▒ ░ ░ ░ ░ ░ ░ ░ ▒ ░ ░ ░ ░ ░▒ ░ ▒░\n ░ ░ ░░ ░ ░ ▒ ░ ░ ░ ░ ░ ░ ░ ░ ░ ░░ ░\n ░ ░ ░ ░ ░ ░ ░ ░ ░ ░\x1b[0m\n'
class
FsaFile:
class FsaFile:
    """
    A parsed .fsa (ABIF) file plus all state accumulated by the fraggler
    pipeline. The constructor parses the file and locates the size-standard
    channel; the remaining attributes are filled in later by the module's
    free functions (find_size_standard_peaks, fit_size_standard_to_ladder, ...).
    """

    def __init__(
        self,
        file: str,
        ladder: str,
        sample_channel: str,
        min_distance_between_peaks: int,
        min_size_standard_height: int,
        normalize: bool = False,
    ) -> None:
        """
        Parse an .fsa file and prepare all pipeline state.

        :param file: path to the .fsa file to read
        :param ladder: ladder name; must be a key of LADDERS (exits otherwise)
        :param sample_channel: name of the channel holding the sample trace
        :param min_distance_between_peaks: minimum spacing (data points)
            between size-standard peaks
        :param min_size_standard_height: minimum intensity of a size-standard peak
        :param normalize: when True, baseline-correct both traces with
            baseline_arPLS
        """
        self.file = Path(file)
        self.file_name = self.file.parts[-1]

        # Abort early on an unknown ladder name.
        if ladder not in LADDERS:
            print_fail(f"'{ladder}' is not a valid ladder")
            sys.exit(1)
        self.ladder = ladder
        # Raw ABIF record dictionary from the .fsa file.
        self.fsa = SeqIO.read(file, "abi").annotations["abif_raw"]
        self.sample_channel = sample_channel
        self.normalize = normalize

        self.ladder_steps = LADDERS[ladder]["sizes"]
        self.n_ladder_peaks = self.ladder_steps.size

        self.min_size_standard_height = min_size_standard_height
        self.size_standard_channel = self.find_size_standard_channel()
        self.min_distance_between_peaks = min_distance_between_peaks
        # Allow a few extra candidate peaks beyond the ladder's step count.
        self.max_peaks_allow_in_size_standard = self.n_ladder_peaks + 5

        ### properties updated by external functions

        # size standard peaks
        self.size_standard_peaks = None
        self.maxium_allowed_distance_between_size_standard_peaks = None
        self.best_size_standard_combinations = None
        self.best_size_standard = None

        # sample data with fitted basepairs
        self.fitted_to_model = False
        self.sample_data_with_basepairs = None
        self.ladder_model = None

        # updated by peak finder functions
        self.sample_data_peaks_raw = None
        self.identified_sample_data_peaks = None
        self.found_peaks = False

        # peak widths
        self.sample_data_peak_widths = None
        self.peaks_with_padding = None

        # area peaks
        self.fitted_area_peaks = None

        # Optionally baseline-correct both traces (arPLS) before analysis.
        if normalize:
            self.size_standard = np.array(
                baseline_arPLS(self.fsa[self.size_standard_channel])
            )
            self.sample_data = np.array(baseline_arPLS(self.fsa[sample_channel]))
        else:
            self.size_standard = np.array(self.fsa[self.size_standard_channel])
            self.sample_data = np.array(self.fsa[self.sample_channel])

    def find_size_standard_channel(
        self,
        degree: int = 2,
    ):
        """
        Guess which DATA channel contains the size standard.

        Peaks above ``min_size_standard_height`` are located in every DATA
        channel; channels with fewer than 12 peaks are skipped. A polynomial
        of the given degree is fitted to the peak positions and the channel
        with the highest R² is returned (``None`` if no channel qualifies).
        """
        data_channels = [x for x in self.fsa.items() if "DATA" in x[0]]
        best_r2 = 0
        best_fitted = None

        for name, trace in data_channels:
            peak_positions = signal.find_peaks(
                trace, height=self.min_size_standard_height
            )[0]

            # Too few peaks cannot be the size standard; skip the channel.
            if len(peak_positions) < 12:
                continue

            X = np.arange(len(peak_positions)).reshape(-1, 1)
            X_poly = PolynomialFeatures(degree=degree).fit_transform(X)
            model = LinearRegression().fit(X_poly, peak_positions)
            r2 = r2_score(peak_positions, model.predict(X_poly))

            if r2 > best_r2:
                # FIX: best_r2 was never updated, so the *last* channel with
                # any positive R² used to win instead of the best-scoring one.
                best_r2 = r2
                best_fitted = name

        return best_fitted

    def __repr__(self):
        """Human-readable summary of parsing parameters and pipeline state."""
        return f"""
            Filename: {self.file_name}
            Sample Channel: {self.sample_channel}
            Size Standard Channel: {self.size_standard_channel}
            Ladder Name: {self.ladder}
            Number of Ladder Steps: {self.n_ladder_peaks}
            Minimum Distance Between Peaks: {self.min_distance_between_peaks}
            Minimum Size Standard Height: {self.min_size_standard_height}
            Normalized Data: {self.normalize}
            Ladder Steps: {self.ladder_steps}
            Fitted to model: {self.fitted_to_model}
            Found peaks: {self.found_peaks}
            """
FsaFile( file: str, ladder: str, sample_channel: str, min_distance_between_peaks: int, min_size_standard_height: int, normalize: bool = False)
    def __init__(
        self,
        file: str,
        ladder: str,
        sample_channel: str,
        min_distance_between_peaks: int,
        min_size_standard_height: int,
        normalize: bool = False,
    ) -> None:
        """
        Parse an .fsa file and prepare all pipeline state.

        :param file: path to the .fsa file to read
        :param ladder: ladder name; must be a key of LADDERS (exits otherwise)
        :param sample_channel: name of the channel holding the sample trace
        :param min_distance_between_peaks: minimum spacing (data points)
            between size-standard peaks
        :param min_size_standard_height: minimum intensity of a size-standard peak
        :param normalize: when True, baseline-correct both traces with
            baseline_arPLS
        """
        self.file = Path(file)
        self.file_name = self.file.parts[-1]

        # Abort early on an unknown ladder name.
        if ladder not in LADDERS.keys():
            print_fail(f"'{ladder}' is not a valid ladder")
            sys.exit(1)
        self.ladder = ladder
        # Raw ABIF record dictionary from the .fsa file.
        self.fsa = SeqIO.read(file, "abi").annotations["abif_raw"]
        self.sample_channel = sample_channel
        self.normalize = normalize

        self.ladder_steps = LADDERS[ladder]["sizes"]
        self.n_ladder_peaks = self.ladder_steps.size

        self.min_size_standard_height = min_size_standard_height
        self.size_standard_channel = self.find_size_standard_channel()
        self.min_distance_between_peaks = min_distance_between_peaks
        # Allow a few extra candidate peaks beyond the ladder's step count.
        self.max_peaks_allow_in_size_standard = self.n_ladder_peaks + 5

        ### properties updated by external functions

        # size standard peaks
        self.size_standard_peaks = None
        self.maxium_allowed_distance_between_size_standard_peaks = None
        self.best_size_standard_combinations = None
        self.best_size_standard = None

        # sample data with fitted basepairs
        self.fitted_to_model = False
        self.sample_data_with_basepairs = None
        self.ladder_model = None

        # updated by peak finder functions
        self.sample_data_peaks_raw = None
        self.identified_sample_data_peaks = None
        self.found_peaks = False

        # peak widths
        self.sample_data_peak_widths = None
        self.peaks_with_padding = None

        # area peaks
        self.fitted_area_peaks = None

        # Optionally baseline-correct both traces (arPLS) before analysis.
        if normalize:
            self.size_standard = np.array(
                baseline_arPLS(self.fsa[self.size_standard_channel])
            )
            self.sample_data = np.array(baseline_arPLS(self.fsa[sample_channel]))
        else:
            self.size_standard = np.array(self.fsa[self.size_standard_channel])
            self.sample_data = np.array(self.fsa[self.sample_channel])
def
find_size_standard_channel(self, degree=2):
237 def find_size_standard_channel( 238 self, 239 degree=2, 240 ): 241 242 data_channels = [x for x in self.fsa.items() if "DATA" in x[0]] 243 best_r2 = 0 244 best_fitted = None 245 246 for i, data in enumerate(data_channels): 247 array = signal.find_peaks(data[1], height=self.min_size_standard_height)[0] 248 249 if len(array) < 12: 250 continue 251 X = np.arange(len(array)).reshape(-1, 1) 252 poly = PolynomialFeatures(degree=degree) 253 X_poly = poly.fit_transform(X) 254 model = LinearRegression().fit(X_poly, array) 255 predicted = model.predict(X_poly) 256 r2 = r2_score(array, predicted) 257 258 if r2 > best_r2: 259 best_fitted = data[0] 260 261 return best_fitted
def
find_size_standard_peaks(fsa):
def find_size_standard_peaks(fsa):
    """
    Locate candidate size-standard peaks in the size-standard trace.

    Keeps the ``fsa.max_peaks_allow_in_size_standard`` tallest peaks, stores
    their positions (sorted by position) in ``fsa.size_standard_peaks`` and
    returns the same FsaFile.
    """
    positions, properties = signal.find_peaks(
        fsa.size_standard,
        height=fsa.min_size_standard_height,
        distance=fsa.min_distance_between_peaks,
    )

    candidates = pd.DataFrame(
        {"peaks": positions, "heights": properties["peak_heights"]}
    )
    # Keep only the tallest candidates, then restore positional order.
    tallest = candidates.sort_values("heights", ascending=False).head(
        fsa.max_peaks_allow_in_size_standard
    )
    fsa.size_standard_peaks = tallest.sort_values("peaks")["peaks"].to_numpy()
    return fsa
def
return_maxium_allowed_distance_between_size_standard_peaks(fsa, multiplier=2):
def return_maxium_allowed_distance_between_size_standard_peaks(fsa, multiplier=2):
    """
    Compute the mean gap between consecutive size-standard peaks, scale it
    by *multiplier*, truncate to int, and store the result on the FsaFile.
    """
    gaps = np.diff(fsa.size_standard_peaks)
    fsa.maxium_allowed_distance_between_size_standard_peaks = int(
        np.mean(gaps) * multiplier
    )
    return fsa
Finds the average distance between consecutive peaks and multiplies it by *multiplier*.
def
generate_combinations(fsa):
def generate_combinations(fsa):
    """
    Enumerate candidate ladder assignments with a memoised depth-first
    search: every increasing selection of ``n_ladder_peaks`` peak positions
    in which consecutive picks differ by at most the allowed distance.

    Stores the result as a one-column DataFrame in
    ``fsa.best_size_standard_combinations`` and returns the FsaFile.
    """
    peaks = fsa.size_standard_peaks
    target_length = fsa.n_ladder_peaks
    max_gap = fsa.maxium_allowed_distance_between_size_standard_peaks
    # Memo keyed on (start, depth): for a fixed start, path[-1] is always
    # peaks[start - 1], so the key fully determines the sub-results.
    cache = {}

    def search(start, depth, path):
        if depth == target_length:
            # A complete path is extended with nothing more.
            return [[]]

        key = (start, depth)
        if key in cache:
            return cache[key]

        found = []
        for idx in range(start, len(peaks)):
            # Guard clause: a non-empty path constrains the next pick's gap.
            if path and abs(peaks[idx] - path[-1]) > max_gap:
                continue
            for tail in search(idx + 1, depth + 1, path + [peaks[idx]]):
                found.append([peaks[idx]] + tail)

        cache[key] = found
        return found

    fsa.best_size_standard_combinations = pd.DataFrame(
        {"combinations": search(0, 0, [])}
    )
    return fsa
depth-first search
def
calculate_best_combination_of_size_standard_peaks(fsa):
def calculate_best_combination_of_size_standard_peaks(fsa):
    """
    Pick the peak combination whose interpolating spline against the ladder
    steps is smoothest, judged by the maximum absolute second derivative.

    Stores the winner in ``fsa.best_size_standard`` and returns the FsaFile.
    """
    ladder = fsa.ladder_steps
    scored = fsa.best_size_standard_combinations.copy()

    # Interpolating spline (s=0) through (ladder step, peak position).
    scored["der"] = [
        UnivariateSpline(ladder, combo, s=0).derivative(n=2)
        for combo in scored.combinations
    ]
    # Smoothness score: worst-case curvature over the ladder steps.
    scored["max_value"] = [max(abs(d2(ladder))) for d2 in scored.der]

    winner = scored.sort_values("max_value", ascending=True).head(1)
    fsa.best_size_standard = np.array(winner.combinations.squeeze())
    return fsa
Finds the best size standard combination of all using UnivariateSpline and second derivative
def
fit_size_standard_to_ladder(fsa):
def fit_size_standard_to_ladder(fsa):
    """
    Fit a spline-regression model mapping size-standard peak positions to
    ladder basepairs, then translate every sample-data time point into
    basepairs. The number of spline knots grows (from 3, at most 20 tries)
    until every predicted basepair value is unique.

    On success sets ``fsa.ladder_model``, ``fsa.sample_data_with_basepairs``
    and ``fsa.fitted_to_model = True``; otherwise ``fitted_to_model`` stays
    False. Returns the FsaFile either way.
    """
    X = fsa.best_size_standard.reshape(-1, 1)
    y = fsa.ladder_steps

    for n_knots in range(3, 23):
        model = make_pipeline(
            SplineTransformer(degree=2, n_knots=n_knots, extrapolation="continue"),
            LinearRegression(fit_intercept=True),
        )
        model.fit(X, y)

        converted = (
            pd.DataFrame({"peaks": fsa.sample_data})
            .reset_index()
            .rename(columns={"index": "time"})
            .assign(basepairs=lambda x: model.predict(x.time.to_numpy().reshape(-1, 1)))
            .assign(basepairs=lambda x: x.basepairs.round(2))
            .loc[lambda x: x.basepairs >= 0]
        )

        # Accept the model only when every predicted basepair is unique.
        if converted.shape[0] == converted.basepairs.nunique():
            fsa.ladder_model = model
            fsa.sample_data_with_basepairs = converted
            fsa.fitted_to_model = True
            return fsa

    # No knot count produced a strictly unique mapping.
    fsa.fitted_to_model = False
    return fsa
Returns the FsaFile with an updated model and a dataframe of sample data with fitted basepairs. Increases the number of knots until every basepair is unique.
def
plot_areas(fsa):
def plot_areas(fsa):
    """
    Build one matplotlib figure per assay showing the raw data points and
    the fitted area model for every peak in that assay.

    Returns a list of figures, in the order the assays appear in
    ``fsa.identified_sample_data_peaks``.
    """

    def plot_helper(identified_peaks, df):
        # One subplot per named peak; y axes shared so heights are comparable.
        # NOTE(review): identified_peaks is currently unused in this helper.
        peaks = [df.loc[lambda x: x.peak_name == p] for p in df.peak_name.unique()]
        fig_areas, axs = plt.subplots(
            1, len(peaks), sharey=True, figsize=(20, 10)
        )
        # if there is only one peak, plt.subplots returns a single Axes
        if len(peaks) == 1:
            axs.plot(df.basepairs, df.peaks, "o")
            axs.plot(df.basepairs, df.fitted)
            axs.set_title(f"Peak 1 area: {df['amplitude'].iloc[0]: .1f}")
            axs.grid()
        # if more than one peak, it returns an array of Axes
        else:
            for i, ax in enumerate(axs):
                ax.plot(
                    peaks[i].basepairs,
                    peaks[i].peaks,
                    "o",
                )
                ax.plot(peaks[i].basepairs, peaks[i].fitted)
                ax.set_title(f"Peak {i + 1} area: {peaks[i]['amplitude'].iloc[0]: .1f}")
                ax.grid()

        fig_areas.suptitle(f"Quotient: {df['quotient'].iloc[0]: .2f}")
        fig_areas.legend(["Raw data", "Model"])
        fig_areas.supxlabel("Basepairs")
        fig_areas.supylabel("Intensity")
        # Detach from pyplot state; the caller renders the returned figure.
        plt.close()

        return fig_areas

    plots = []
    for a in fsa.identified_sample_data_peaks.assay.unique():
        identified_peaks = fsa.identified_sample_data_peaks.loc[lambda x: x.assay == a]
        df = fsa.fitted_area_peaks.loc[lambda x: x.assay == a]
        plot = plot_helper(identified_peaks, df)
        plots.append(plot)

    return plots
def
make_fsa_data_df(fsa) -> pandas.core.frame.DataFrame:
def make_fsa_data_df(fsa) -> pd.DataFrame:
    """
    Stack every DATA channel of the raw ABIF record into one long dataframe
    with columns ``data`` (intensity), ``channel`` (channel name) and
    ``time`` (0-based sample index within the channel).
    """
    channel_names = [key for key in fsa.fsa if "DATA" in key]
    frames = [
        pd.DataFrame()
        .assign(data=fsa.fsa[name])
        .assign(channel=name)
        .assign(time=lambda x: range(x.shape[0]))
        for name in channel_names
    ]
    return pd.concat(frames)
def
plot_fsa_data(fsa) -> list:
def plot_fsa_data(fsa) -> list:
    """
    Build one interactive altair line chart per DATA channel plus a combined
    chart of all channels. Returns a list of (chart, name) tuples.
    """
    alt.data_transformers.disable_max_rows()
    df = make_fsa_data_df(fsa)

    def line_chart(data):
        # Shared chart recipe: time vs. intensity, coloured by channel.
        return (
            alt.Chart(data)
            .mark_line()
            .encode(
                alt.X("time:Q", title="Time"),
                alt.Y("data:Q", title="Intensity"),
                alt.Color("channel:N"),
            )
            .properties(
                width=800,
                height=500,
            )
            .interactive()
        )

    plots = [
        (line_chart(df.loc[lambda x: x.channel == channel]), channel)
        for channel in df.channel.unique()
    ]
    plots.append((line_chart(df), "All channels"))
    return plots
def
plot_all_found_peaks(fsa):
def plot_all_found_peaks(fsa):
    """
    Plot the sample-data trace around the identified peaks, mark each
    identified peak and label it with its basepair position.

    Returns the matplotlib figure.
    """
    fig_peaks = plt.figure(figsize=(20, 10))

    # Restrict the trace to a 10 bp margin around the identified peaks.
    df = fsa.sample_data_peaks_raw.loc[
        lambda x: x.basepairs > fsa.identified_sample_data_peaks.basepairs.min() - 10
    ].loc[lambda x: x.basepairs < fsa.identified_sample_data_peaks.basepairs.max() + 10]

    plt.plot(df.basepairs, df.peaks)
    plt.plot(
        fsa.identified_sample_data_peaks.basepairs,
        fsa.identified_sample_data_peaks.peaks,
        "o",
    )
    # Annotate every identified peak with its basepair position.
    for x, y in zip(
        fsa.identified_sample_data_peaks.basepairs,
        fsa.identified_sample_data_peaks.peaks,
    ):
        plt.text(x, y, f"{round(x, 1)} bp")

    plt.title(f"Channel: {fsa.sample_channel}")
    plt.xticks(np.arange(df.basepairs.min(), df.basepairs.max(), 10), rotation=90)
    plt.ylabel("intensity")
    plt.xlabel("basepairs")
    plt.grid()
    # Detach from pyplot state; the caller renders the returned figure.
    plt.close()

    return fig_peaks
def
plot_size_standard_peaks(fsa):
def plot_size_standard_peaks(fsa):
    """
    Plot the size-standard trace with the chosen best peak combination
    marked and each peak annotated with its ladder step size.

    Returns the matplotlib figure.
    """
    best_combination = fsa.best_size_standard
    size_standard = fsa.size_standard
    ladder_name = fsa.ladder
    ladder_size = fsa.ladder_steps

    fig_ladder_peaks = plt.figure(figsize=(20, 10))
    plt.plot(size_standard)
    # Mark the selected peaks on top of the raw size-standard trace.
    plt.plot(best_combination, size_standard[best_combination], "o")
    plt.xlabel("Time")
    plt.ylabel("Intensity")
    plt.legend(["Size standard", "Peak (bp)"])
    plt.title(ladder_name)
    plt.grid()

    # Annotate each chosen peak with its ladder step (bp).
    for peak, ladder in zip(best_combination, ladder_size):
        plt.text(peak, size_standard[peak], ladder)

    # Detach from pyplot state; the caller renders the returned figure.
    plt.close()
    return fig_ladder_peaks
def
plot_model_fit(fsa):
def plot_model_fit(fsa):
    """
    Visualize how well the fitted ladder model maps size-standard peak
    positions to the known ladder sizes, with MSE and R² in the title.

    Returns the matplotlib figure.
    """
    ladder_size = fsa.ladder_steps
    best_combination = fsa.best_size_standard

    predicted = fsa.ladder_model.predict(best_combination.reshape(-1, 1))
    ladder_name = fsa.ladder

    mse = mean_squared_error(ladder_size, predicted)
    r2 = r2_score(ladder_size, predicted)

    fig_model_fit = plt.figure(figsize=(20, 10))
    # True ladder sizes vs. peak positions ("o") and predicted sizes ("x").
    plt.plot(ladder_size, best_combination, "o")
    plt.plot(predicted, best_combination, "x")
    plt.xticks(np.arange(0, np.max(ladder_size), 50))
    plt.xlabel("bp")
    plt.yticks(np.arange(0, np.max(best_combination), 500))
    plt.suptitle(ladder_name)
    plt.title(f"Mean squared error: {mse: .3f}, R2: {r2: .5f}")
    plt.legend(["True value", "Predicted value"])
    plt.grid()

    # Detach from pyplot state; the caller renders the returned figure.
    plt.close()
    return fig_model_fit
def
find_peaks_agnostic( fsa, peak_height_sample_data: int, min_ratio: float, distance_between_assays: int, search_peaks_start: int):
def find_peaks_agnostic(
    fsa,
    peak_height_sample_data: int,
    min_ratio: float,
    distance_between_assays: int,
    search_peaks_start: int,
):
    """
    Find sample-data peaks without a user-supplied peak table.

    Peaks above ``peak_height_sample_data`` (past ``search_peaks_start``
    basepairs) are grouped into assays wherever the basepair gap between
    consecutive peaks exceeds ``distance_between_assays``; within each
    assay, peaks smaller than ``min_ratio`` times the assay's tallest peak
    are dropped.

    Updates ``fsa.sample_data_peaks_raw``,
    ``fsa.identified_sample_data_peaks`` and ``fsa.found_peaks``
    ("agnostic" on success, "error" when nothing survives) and returns the
    same FsaFile.
    """

    peaks_dataframe = fsa.sample_data_with_basepairs.loc[
        lambda x: x.basepairs > search_peaks_start
    ]
    peaks_index, _ = signal.find_peaks(
        peaks_dataframe.peaks, height=peak_height_sample_data
    )

    identified_peaks = (
        peaks_dataframe.iloc[peaks_index]
        .assign(peaks_index=peaks_index)
        .assign(peak_name=lambda x: range(1, x.shape[0] + 1))
        # separate the peaks into different assay groups depending on the distance
        # between the peaks
        .assign(difference=lambda x: x.basepairs.diff())
        # the first row's diff is NaN; 100 makes it start a new assay group
        .fillna(100)
        .assign(
            assay=lambda x: np.select(
                [x.difference > distance_between_assays],
                [x.peak_name * 10],
                default=pd.NA,
            )
        )
        # forward-fill so every peak inherits the assay id of its group starter
        # NOTE(review): fillna(method="ffill") is deprecated in newer pandas.
        .fillna(method="ffill")
        .assign(max_peak=lambda x: x.groupby("assay")["peaks"].transform(np.max))
        .assign(ratio=lambda x: x.peaks / x.max_peak)
        .loc[lambda x: x.ratio > min_ratio]
        # renumber the surviving peaks consecutively
        .assign(peak_name=lambda x: range(1, x.shape[0] + 1))
    )

    if identified_peaks.shape[0] == 0:
        fsa.found_peaks = "error"
        return fsa

    fsa.sample_data_peaks_raw = peaks_dataframe
    fsa.identified_sample_data_peaks = identified_peaks
    fsa.found_peaks = "agnostic"

    return fsa
def read_custom_peaks(custom_peaks):
    """
    Normalize the user-supplied custom peaks into a DataFrame.

    Accepts either an in-memory DataFrame or a path to a csv file; missing
    values are replaced with 0.  Exits the program when neither form can be
    read.
    """
    if isinstance(custom_peaks, pd.DataFrame):
        table = custom_peaks.fillna(0)
    elif isinstance(custom_peaks, str):
        table = pd.read_csv(custom_peaks).fillna(0)
    else:
        table = None

    if not isinstance(table, pd.DataFrame):
        print_fail("No custom peaks could be read")
        sys.exit(1)

    return table
def custom_peaks_are_overlapping(custom_peaks):
    """
    Abort with an error message if any two custom peak ranges overlap.

    Every [start, stop) range is expanded to its individual positions; if the
    expanded table has fewer unique positions than rows, at least two ranges
    share a position and the program exits.
    """
    peaks = read_custom_peaks(custom_peaks)
    exploded = (
        peaks.sort_values("start")
        .assign(intervals=lambda x: [range(y.start, y.stop) for y in x.itertuples()])
        .explode("intervals")
    )

    if exploded.shape[0] == exploded.intervals.nunique():
        return

    # report the lowest position that occurs in more than one range
    first_duplicate = (
        exploded.value_counts("intervals")
        .reset_index()
        .sort_values("intervals")
        .loc[lambda x: x["count"] > 1]
        .iloc[0, 0]
    )
    print_fail(
        f"Custom peaks contains overlapping ranges starting at value: {first_duplicate}"
    )
    sys.exit(1)
def custom_peaks_has_columns(custom_peaks):
    """
    Abort unless the custom peaks table has exactly the required columns.

    Required: name, start, stop, amount, min_ratio, which, peak_distance.
    """
    df = read_custom_peaks(custom_peaks)

    columns = {"name", "start", "stop", "amount", "min_ratio", "which", "peak_distance"}
    df_columns = set(df.columns)

    # the original pair of length checks (len mismatch OR intersection
    # mismatch) is exactly equivalent to a direct set comparison
    if df_columns != columns:
        print_fail("Customized peaks table does not contain the right columns.")
        print_fail(f"Current columns: {df_columns}, Needed columns: {columns}")
        sys.exit(1)
def find_peaks_customized(
    fsa,
    custom_peaks,
    peak_height_sample_data: int,
    search_peaks_start: int,
):
    """
    Find sample peaks restricted to the user-defined assay windows.

    For every row in ``custom_peaks`` the detected peaks are filtered to the
    (start, stop) basepair window, then optionally ranked and limited by the
    ``amount``, ``which`` ("LARGEST" or "FIRST"), ``min_ratio`` and
    ``peak_distance`` columns.  Results are stored on ``fsa``;
    ``fsa.found_peaks`` is "custom_peaks" on success or "error" when nothing
    is left after filtering.
    """
    custom_peaks = read_custom_peaks(custom_peaks)

    peaks_dataframe = fsa.sample_data_with_basepairs.loc[
        lambda x: x.basepairs > search_peaks_start
    ]
    peaks_index, _ = signal.find_peaks(
        peaks_dataframe.peaks, height=peak_height_sample_data
    )

    # Filter the df to get right peaks
    identified_peaks = peaks_dataframe.iloc[peaks_index].assign(peaks_index=peaks_index)

    # Filter the above df based on the custom peaks from the user
    customized_peaks = []
    for assay in custom_peaks.itertuples():
        df = (
            identified_peaks.loc[lambda x: x.basepairs > assay.start]
            .loc[lambda x: x.basepairs < assay.stop]
            .assign(assay=assay.name)
        )

        # Rank the peaks by height and filter out the smallest ones
        if assay.amount != 0:
            if assay.which == "LARGEST" or assay.which == "":
                df = (
                    df.assign(max_peak=lambda x: x.peaks.max())
                    .assign(ratio=lambda x: x.peaks / x.max_peak)
                    .loc[lambda x: x.ratio > assay.min_ratio]
                    .assign(rank_peak=lambda x: x.peaks.rank(ascending=False))
                    .loc[lambda x: x.rank_peak <= assay.amount]
                    .drop(columns=["rank_peak"])
                )
                if assay.peak_distance != 0:
                    df = (
                        df.assign(distance=lambda x: x.basepairs.diff())
                        .assign(distance=lambda x: x.distance.fillna(0))
                        .loc[lambda x: x.distance <= assay.peak_distance]
                        .drop(columns=["distance"])
                    )

            elif assay.which == "FIRST":
                df = (
                    df.assign(max_peak=lambda x: x.peaks.max())
                    .assign(ratio=lambda x: x.peaks / x.max_peak)
                    .loc[lambda x: x.ratio > assay.min_ratio]
                    .sort_values("basepairs", ascending=True)
                    .head(assay.amount)
                )
                if assay.peak_distance != 0:
                    df = (
                        df.assign(distance=lambda x: x.basepairs.diff())
                        .assign(distance=lambda x: x.distance.fillna(0))
                        .loc[lambda x: x.distance <= assay.peak_distance]
                        .drop(columns=["distance"])
                    )
            else:
                # sys.exit (not the REPL helper exit()) for consistency with
                # every other fatal path in this module
                print_fail("Column `which` must be `FIRST` or `LARGEST`")
                sys.exit(1)

        customized_peaks.append(df)

    identified_peaks = (
        pd.concat(customized_peaks)
        .reset_index()
        .assign(peak_name=lambda x: range(1, x.shape[0] + 1))
    )

    if identified_peaks.shape[0] == 0:
        fsa.found_peaks = "error"
        return fsa

    fsa.sample_data_peaks_raw = peaks_dataframe
    fsa.identified_sample_data_peaks = identified_peaks
    fsa.found_peaks = "custom_peaks"

    return fsa
def find_peak_widths(fsa, rel_height: float = 0.95):
    """
    Measure the width of every identified sample peak.

    Uses scipy's peak_widths at the given relative height, rounds the
    interpolated left/right edges outward to whole indices and merges the
    measurements back onto the identified peaks (joined on peak_name).
    """
    raw = fsa.sample_data_peaks_raw
    identified = fsa.identified_sample_data_peaks

    measured = signal.peak_widths(
        raw.peaks,
        identified.peaks_index,
        rel_height=rel_height,
    )

    width_df = pd.DataFrame(measured).T
    width_df.columns = ["x", "peak_height", "peak_start", "peak_end"]

    # round the edges outward so the whole peak is covered, then attach
    # sequential peak names for the merge
    width_df = width_df.assign(
        peak_start=lambda x: np.floor(x.peak_start).astype(int),
        peak_end=lambda x: np.ceil(x.peak_end).astype(int),
        peak_name=lambda x: range(1, x.shape[0] + 1),
    )

    fsa.sample_data_peak_widths = width_df.merge(identified, on="peak_name")
    return fsa
def find_peaks_with_padding(fsa, padding: int = 4):
    """
    Extract the raw data points belonging to each peak, padded on both sides.

    A few extra points are taken to the left and right of the measured peak
    edges to be sure the whole peak is included when a model is fitted to it.
    """
    df = fsa.sample_data_peak_widths
    divided_peak_widths = [df.loc[df.assay == x] for x in df.assay.unique()]

    peaks_with_padding = []
    for assay in divided_peak_widths:
        whole_peaks = [
            (
                fsa.sample_data_peaks_raw.iloc[
                    # clamp at 0: a negative start would make iloc slice from
                    # the END of the dataframe and return the wrong points
                    max(x.peak_start - padding, 0) : x.peak_end + padding
                ]
                .assign(assay=x.assay)
                .assign(peak_name=x.peak_name)
            )
            for x in assay.itertuples()
        ]
        peaks_with_padding.append(pd.concat(whole_peaks))

    fsa.peaks_with_padding = pd.concat(peaks_with_padding)
    return fsa
def fit_lmfit_model_to_area(fsa, peak_finding_model: str):
    """
    Fit an lmfit line-shape model to every padded peak.

    Parameters
    ----------
    fsa : object carrying a ``peaks_with_padding`` dataframe (peaks grouped
        by assay and peak_name, with ``time`` and ``peaks`` columns)
    peak_finding_model : one of "gauss", "voigt", "lorentzian"

    Raises
    ------
    NotImplementedError
        For any other model name.
    """
    if peak_finding_model == "gauss":
        model = GaussianModel()
    elif peak_finding_model == "voigt":
        model = VoigtModel()
    elif peak_finding_model == "lorentzian":
        model = LorentzianModel()
    else:
        raise NotImplementedError(
            f"""
            {peak_finding_model} is not implemented!
            Options: [gauss, voigt, lorentzian]
            """
        )

    fitted_peaks = []
    for a in fsa.peaks_with_padding.assay.unique():
        assay = fsa.peaks_with_padding.loc[lambda x: x.assay == a]
        for p in assay.peak_name.unique():
            peak_df = assay.loc[lambda x: x.peak_name == p]
            y = peak_df.peaks.to_numpy()
            x = peak_df.time.to_numpy()
            params = model.guess(y, x)
            out = model.fit(y, params, x=x)

            # fit_report() builds a long text report; render it once and reuse
            report = out.fit_report()
            r_match = re.findall(r"R-squared *= (0\.\d{3})", report)
            # guard: an R-squared outside the 0.xxx form (e.g. 1.000 or a
            # negative value) would otherwise crash on the empty match list
            r_value = float(r_match[0]) if r_match else float("nan")

            fitted_peak = peak_df.assign(
                fitted=out.best_fit,
                model=peak_finding_model,
                amplitude=out.values["amplitude"],
                center=out.values["center"],
                sigma=out.values["sigma"],
                fwhm=out.values["fwhm"],
                fit_report=report,
                r_value=r_value,
            )
            fitted_peaks.append(fitted_peak)

    fsa.fitted_area_peaks = pd.concat(fitted_peaks)
    return fsa
def calculate_quotients(fsa):
    """
    Calculate a per-assay quotient from the fitted peak amplitudes.

    For each assay the amplitudes are pivoted to a single wide row
    (index = assay/basepairs, one amplitude column per peak) and a quotient
    is derived from amplitude ratios; that quotient is then broadcast back
    onto every row of the assay in ``fsa.fitted_area_peaks``.
    """
    quotients = []
    for a in fsa.fitted_area_peaks.assay.unique():
        assay = fsa.fitted_area_peaks.loc[lambda x: x.assay == a]
        # collapse basepairs to the first peak's value so the pivot below
        # yields exactly one row per assay
        df = (
            assay[["assay", "peak_name", "amplitude", "basepairs"]]
            .assign(basepairs=lambda x: x.iloc[0, -1])
            .drop_duplicates()
        )

        # wide layout: [assay, basepairs, amplitude_peak1, amplitude_peak2, ...]
        # NOTE(review): the positional iloc indexing below depends on exactly
        # this column order from pivot_wider — verify before reordering columns.
        wide = df.pivot_wider(
            index=["assay", "basepairs"],
            names_from=["peak_name"],
            values_from=["amplitude"],
        )

        if wide.shape[1] > 4:
            # three or more peaks: last amplitude over the mean of the first two
            quotient = wide.assign(
                quotient=lambda x: x.iloc[0, -1] / x.iloc[0, 2:4].mean()
            ).quotient.squeeze()
        elif wide.shape[1] == 3:
            # a single peak: no ratio can be formed
            quotient = 0
        else:
            # exactly two peaks: second amplitude over the first
            quotient = wide.assign(
                quotient=lambda x: x.iloc[0, 3] / x.iloc[0, 2]
            ).quotient.squeeze()
        assay = assay.assign(quotient=quotient)
        quotients.append(assay)

    fsa.fitted_area_peaks = pd.concat(quotients)
    return fsa
def
update_identified_sample_data_peaks(fsa):
def cli():
    """
    Command line entry point: parse the arguments and run the pipeline.

    Builds the argparse interface, echoes the full invocation to the
    terminal, then delegates to ``main`` with the parsed values.
    """
    parser = argparse.ArgumentParser(
        description="Analyze your Fragment analysis files!"
    )
    parser.add_argument(
        "-t",
        "--type",
        type=str,
        required=True,
        choices=["area", "peak"],
        help="Fraggler area or fraggler peak",
    )
    parser.add_argument("-f", "--fsa", required=True, help="fsa file to analyze")
    parser.add_argument(
        "-o",
        "--output",
        required=True,
        help="Output folder",
    )
    # only the bundled ladder definitions are valid choices
    valid_ladders = list(LADDERS)
    parser.add_argument(
        "-l",
        "--ladder",
        choices=valid_ladders,
        required=True,
        help="Which ladder to use",
    )
    parser.add_argument(
        "-sc",
        "--sample_channel",
        required=True,
        help="Which sample channel to use. E.g: 'DATA1', 'DATA2'...",
    )
    parser.add_argument(
        "-min_dist",
        "--min_distance_between_peaks",
        required=False,
        type=int,
        default=30,
        help="Minimum distance between size standard peaks",
    )
    parser.add_argument(
        "-min_s_height",
        "--min_size_standard_height",
        required=False,
        type=int,
        default=100,
        help="Minimum height of size standard peaks",
    )
    parser.add_argument(
        "-cp",
        "--custom_peaks",
        required=False,
        default=None,
        type=str,
        help="csv file with custom peaks to find",
    )
    parser.add_argument(
        "-height_sample",
        "--peak_height_sample_data",
        required=False,
        default=300,
        type=int,
        help="Minimum height of peaks in sample data",
    )
    parser.add_argument(
        "-min_ratio",
        "--min_ratio_to_allow_peak",
        required=False,
        default=0.15,
        type=float,
        help="Minimum ratio of the lowest peak compared to the highest peak in the assay",
    )
    parser.add_argument(
        "-distance",
        "--distance_between_assays",
        required=False,
        default=15,
        type=int,
        help="Minimum distance between assays in a multiple assay experiment",
    )
    parser.add_argument(
        "-peak_start",
        "--search_peaks_start",
        required=False,
        default=115,
        type=int,
        help="Where to start searching for peaks in basepairs",
    )
    parser.add_argument(
        "-m",
        "--peak_area_model",
        required=False,
        default="gauss",
        choices=["gauss", "voigt", "lorentzian"],
        type=str,
        help="Which peak finding model to use",
    )

    args = parser.parse_args()

    print(ASCII_ART)
    # echo the full invocation so it appears in the terminal and the log
    command = "fraggler \n" + "".join(f"{k}: {v}\n" for k, v in vars(args).items())
    print_green(command)

    main(
        command,
        sub_command=args.type,
        fsa=args.fsa,
        output=args.output,
        ladder=args.ladder,
        sample_channel=args.sample_channel,
        min_distance_between_peaks=args.min_distance_between_peaks,
        min_size_standard_height=args.min_size_standard_height,
        custom_peaks=args.custom_peaks,
        peak_height_sample_data=args.peak_height_sample_data,
        min_ratio_to_allow_peak=args.min_ratio_to_allow_peak,
        distance_between_assays=args.distance_between_assays,
        search_peaks_start=args.search_peaks_start,
        peak_area_model=args.peak_area_model,
    )
def
write_log(file, *text):
def
read_valid_csv(csv):
def parse_fsa(
    fsa,
    ladder,
    sample_channel,
    min_distance_between_peaks,
    min_size_standard_height,
):
    """
    Parse one .fsa file into an FsaFile object.

    Returns None when parsing fails for any reason, so the caller can log
    the failure and continue with the next file.
    """
    try:
        return FsaFile(
            fsa,
            ladder,
            sample_channel=sample_channel,
            min_distance_between_peaks=min_distance_between_peaks,
            min_size_standard_height=min_size_standard_height,
        )
    except Exception:
        # narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate while any parsing error maps to None
        return None
def header(
    text: str,
    bg_color: str = "#04c273",
    height: int = 300,
    fontsize: str = "px20",
    textalign: str = "center",
):
    """
    Template for markdown header like block
    """
    # NOTE(review): the default fontsize "px20" looks reversed ("20px" would
    # be valid CSS) — kept as-is to preserve current rendering; confirm.
    css = {
        "color": "white",
        "padding": "10px",
        "text-align": f"{textalign}",
        "font-size": f"{fontsize}",
    }
    return pn.pane.Markdown(
        f"""
        {text}
        """,
        background=bg_color,
        height=height,
        margin=10,
        style=css,
    )
Template for markdown header like block
def
make_header(name: str, date: str) -> panel.pane.markup.Markdown:
def generate_peak_report(fsa):
    """
    Assemble the interactive HTML "peak" report for one fsa file.

    Sections: raw channel plots, found peaks, ladder diagnostics and the
    peaks table.  Returns a panel Column ready to be saved as HTML.
    """
    ### ----- Raw Data ----- ###
    channel_header = header(
        text="## Plot of channels",
        bg_color="#04c273",
        height=80,
        textalign="left",
    )
    # PLOT: one interactive Vega tab per channel
    channel_tab = pn.Tabs()
    for plot, name in plot_fsa_data(fsa):
        pane = pn.pane.Vega(plot.interactive(), sizing_mode="stretch_both", name=name)
        channel_tab.append(pane)

    channels_section = pn.Column(channel_header, channel_tab)

    ### ----- Peaks ----- ###
    peaks_header = header(
        text="## Plot of Peaks",
        bg_color="#04c273",
        height=80,
        textalign="left",
    )

    # PLOT
    peaks_plot = plot_all_found_peaks(fsa)
    peaks_pane = pn.pane.Matplotlib(peaks_plot, name="Peaks")

    # Section
    peaks_tab = pn.Tabs(
        peaks_pane,
    )
    peaks_section = pn.Column(peaks_header, peaks_tab)

    ### ----- Ladder Information ----- ###
    ladder_header = header(
        text="## Information about the ladder",
        bg_color="#04c273",
        height=80,
        textalign="left",
    )
    # Ladder peak plot
    ladder_plot = plot_size_standard_peaks(fsa)
    ladder_peak_plot = pn.pane.Matplotlib(
        ladder_plot,
        name="Ladder Peak Plot",
    )
    # Ladder Correlation
    model_fit = plot_model_fit(fsa)
    ladder_correlation_plot = pn.pane.Matplotlib(
        model_fit,
        name="Ladder Correlation Plot",
    )

    # Section
    ladder_tab = pn.Tabs(
        ladder_peak_plot,
        ladder_correlation_plot,
    )
    ladder_section = pn.Column(ladder_header, ladder_tab)

    ### ----- Peaks dataframe ----- ###
    dataframe_header = header(
        text="## Peaks Table", bg_color="#04c273", height=80, textalign="left"
    )
    # Create dataframe
    df = fsa.identified_sample_data_peaks.assign(file_name=fsa.file_name)[
        ["basepairs", "assay", "peak_name", "file_name"]
    ]
    # DataFrame Tabulator
    peaks_df_tab = pn.widgets.Tabulator(
        df,
        layout="fit_columns",
        pagination="local",
        page_size=15,
        show_index=False,
        name="Peaks Table",
    )

    # Section
    dataframe_tab = pn.Tabs(peaks_df_tab)
    dataframe_section = pn.Column(dataframe_header, dataframe_tab)

    ### CREATE REPORT ###
    # NOTE(review): "RUND1" appears to hold the run date (it is passed as the
    # report header's date) — confirm against the fsa field definitions
    file_name = fsa.file_name
    date = fsa.fsa["RUND1"]
    head = make_header(file_name, date)

    all_tabs = pn.Tabs(
        ("Channels", channels_section),
        ("Peaks", peaks_section),
        ("Ladder", ladder_section),
        ("Peaks Table", dataframe_section),
        tabs_location="left",
    )
    report = pn.Column(
        head,
        pn.layout.Divider(),
        all_tabs,
    )

    return report
def generate_area_report(fsa):
    """
    Assemble the interactive HTML "area" report for one fsa file.

    Like the peak report, but with an extra section showing the fitted peak
    areas per assay and a table including model fit quality and quotients.
    """

    ### ----- Raw Data ----- ###
    channel_header = header(
        text="## Plot of channels",
        bg_color="#04c273",
        height=80,
        textalign="left",
    )
    # PLOT: one interactive Vega tab per channel
    channel_tab = pn.Tabs()
    for plot, name in plot_fsa_data(fsa):
        pane = pn.pane.Vega(plot.interactive(), sizing_mode="stretch_both", name=name)
        channel_tab.append(pane)

    channels_section = pn.Column(channel_header, channel_tab)

    ### ----- Peaks ----- ###
    peaks_header = header(
        text="## Plot of Peaks",
        bg_color="#04c273",
        height=80,
        textalign="left",
    )

    # PLOT
    peaks_plot = plot_all_found_peaks(fsa)
    peaks_pane = pn.pane.Matplotlib(peaks_plot, name="Peaks")

    # Section
    peaks_tab = pn.Tabs(
        peaks_pane,
    )
    peaks_section = pn.Column(peaks_header, peaks_tab)

    ### ----- Ladder Information ----- ###
    ladder_header = header(
        text="## Information about the ladder",
        bg_color="#04c273",
        height=80,
        textalign="left",
    )
    # Ladder peak plot
    ladder_plot = plot_size_standard_peaks(fsa)
    ladder_peak_plot = pn.pane.Matplotlib(
        ladder_plot,
        name="Ladder Peak Plot",
    )
    # Ladder Correlation
    model_fit = plot_model_fit(fsa)
    ladder_correlation_plot = pn.pane.Matplotlib(
        model_fit,
        name="Ladder Correlation Plot",
    )

    # Section
    ladder_tab = pn.Tabs(
        ladder_peak_plot,
        ladder_correlation_plot,
    )
    ladder_section = pn.Column(ladder_header, ladder_tab)

    ### ----- Areas Information ----- ###
    areas_header = header(
        text="## Peak Areas", bg_color="#04c273", height=80, textalign="left"
    )
    areas_tab = pn.Tabs()
    # one tab per assay with its fitted area plot
    area_plots_list = plot_areas(fsa)
    for i, plot in enumerate(area_plots_list):
        name = f"Assay {i + 1}"
        plot_pane = pn.pane.Matplotlib(plot, name=name)
        areas_tab.append(plot_pane)

    # Section
    areas_section = pn.Column(areas_header, areas_tab)

    ### ----- Peaks DataFrame ----- ###
    dataframe_header = header(
        text="## Peaks Table", bg_color="#04c273", height=80, textalign="left"
    )

    df = fsa.identified_sample_data_peaks[
        ["basepairs", "assay", "peak_name", "model", "r_value", "quotient"]
    ]

    # DataFrame Tabulator
    peaks_df_tab = pn.widgets.Tabulator(
        df,
        layout="fit_columns",
        pagination="local",
        page_size=15,
        show_index=False,
        name="Peaks Table",
    )

    # Section
    dataframe_tab = pn.Tabs(peaks_df_tab)
    dataframe_section = pn.Column(dataframe_header, dataframe_tab)

    ### CREATE REPORT ###
    # NOTE(review): "RUND1" appears to hold the run date (it is passed as the
    # report header's date) — confirm against the fsa field definitions
    file_name = fsa.file_name
    date = fsa.fsa["RUND1"]
    head = make_header(file_name, date)

    all_tabs = pn.Tabs(
        ("Channels", channels_section),
        ("Peaks", peaks_section),
        ("Ladder", ladder_section),
        ("Areas", areas_section),
        ("Peak Table", dataframe_section),
        tabs_location="left",
    )
    report = pn.Column(
        head,
        pn.layout.Divider(),
        all_tabs,
    )

    return report
def generate_no_peaks_report(fsa):
    """
    Build a fallback report showing only the raw channel data.

    Used when peak finding or ladder fitting failed, so the user can inspect
    the traces manually.
    """
    channel_header = header(
        text="## Plot of channels",
        bg_color="#04c273",
        height=80,
        textalign="left",
    )
    # PLOT: one interactive Vega tab per channel
    channel_tab = pn.Tabs()
    for plot, name in plot_fsa_data(fsa):
        pane = pn.pane.Vega(plot.interactive(), sizing_mode="stretch_both", name=name)
        channel_tab.append(pane)
    channels_section = pn.Column(channel_header, channel_tab)

    ### CREATE REPORT ###
    # (the unused file_name/date locals were dropped; the RUND1 lookup could
    # even raise on files missing that field for no benefit)
    head = header(
        "# No peaks could be generated. Please look at the raw data.", height=100
    )

    all_tabs = pn.Tabs(
        ("Channels", channels_section),
        tabs_location="left",
    )
    return pn.Column(
        head,
        pn.layout.Divider(),
        all_tabs,
    )
def main(
    command,
    sub_command,
    fsa,
    output,
    ladder,
    sample_channel,
    min_distance_between_peaks,
    min_size_standard_height,
    custom_peaks,
    peak_height_sample_data,
    min_ratio_to_allow_peak,
    distance_between_assays,
    search_peaks_start,
    peak_area_model,
):
    """
    Run the fraggler pipeline ("peak" or "area") over the given fsa input.

    For every fsa file: parse it, find and fit the size standard, find sample
    peaks (custom table or agnostic), then write an HTML report.  Peak tables
    are aggregated into <output>/fraggler_peaks.csv and everything is logged
    to <output>/fraggler.log.  Files that fail any stage get a raw-data fail
    report and are summarized at the end.
    """
    FAILED_FILES = []
    PEAK_TABLES = []
    start_time = datetime.now()
    print_green(f"Running fraggler {sub_command}!")

    folder_exists(output)
    make_dir(output)

    LOG_FILE = f"{output}/fraggler.log"
    write_log(
        LOG_FILE,
        f"Date: {datetime.today().strftime('%Y-%m-%d %H:%M:%S')}",
        "",
        "Command:",
        command,
        "---------------------------",
    )

    fsa_files = get_files(fsa)

    for fsa in fsa_files:
        print_green(f" Running fraggler on {fsa}")
        write_log(LOG_FILE, f"Parsing {fsa}:")
        fsa_file = parse_fsa(
            fsa,
            ladder,
            sample_channel=sample_channel,
            min_distance_between_peaks=min_distance_between_peaks,
            min_size_standard_height=min_size_standard_height,
        )

        # parse_fsa returns None on any parsing error
        if fsa_file is None:
            print_fail(f"Could not parse fsa {fsa}")
            write_log(
                LOG_FILE,
                "Could not parse fsa",
                "Aborting",
                "",
            )
            print_warning("Continuing to the next file...")
            print("")
            FAILED_FILES.append(fsa)
            continue

        print_green(f"Using size standard channel: {fsa_file.size_standard_channel}")
        write_log(LOG_FILE, f"Size standard channel: {fsa_file.size_standard_channel}")

        fsa = find_size_standard_peaks(fsa_file)

        # if too few size standard peaks are found
        if len(fsa.size_standard_peaks) < len(fsa.ladder_steps):
            write_log(
                LOG_FILE,
                "Too few size standard peaks found",
                "Aborting",
                "",
            )
            print_warning("Too few size standard peaks found")
            print_warning(f"Ladder peaks number: {len(fsa.ladder_steps)}")
            print_warning(f"Found size standard peaks: {len(fsa.size_standard_peaks)}")
            print_warning("Try changing the --min_size_standard_height")
            print_warning(f"Current value: {min_size_standard_height}")
            print_warning(f"...Or change ladder. Current ladder {ladder}")
            print_warning("Generating a report of the raw data, please have a look...")
            no_peaks_report = generate_no_peaks_report(fsa)
            no_peaks_report.save(f"{output}/{fsa.file_name}_fraggler_fail.html")
            print_warning("Continuing to the next file...")
            print("")
            FAILED_FILES.append(fsa.file_name)
            continue

        fsa = return_maxium_allowed_distance_between_size_standard_peaks(
            fsa, multiplier=2
        )

        # try to find a good number for allowed diffs between peaks
        for _ in range(20):
            fsa = generate_combinations(fsa)
            if fsa.best_size_standard_combinations.shape[0] > 0:
                break

            fsa.maxium_allowed_distance_between_size_standard_peaks += 10

        if fsa.best_size_standard_combinations.shape[0] == 0:
            write_log(
                LOG_FILE,
                "No combinations of the size standard could be made",
                "Aborting",
                "",
            )
            print_warning("No combinations of the size standard could be made")
            print_warning("Try changing the --min_size_standard_height")
            print_warning(f"Current value: {min_size_standard_height}")
            print_warning(f"...Or change ladder. Current ladder {ladder}")
            print_warning("Generating a report of the raw data, please have a look...")
            no_peaks_report = generate_no_peaks_report(fsa)
            no_peaks_report.save(f"{output}/{fsa.file_name}_fraggler_fail.html")
            print_warning("Continuing to the next file...")
            print("")
            FAILED_FILES.append(fsa.file_name)
            continue

        fsa = calculate_best_combination_of_size_standard_peaks(fsa)
        fsa = fit_size_standard_to_ladder(fsa)

        # if no model could be fitted
        if not fsa.fitted_to_model:
            print_warning(f"No ladder model could be fitted to {fsa.file_name}...")
            print_warning("Try changing the --min_size_standard_height")
            print_warning(f"Current value: {min_size_standard_height}")
            write_log(
                LOG_FILE,
                "No ladder model could be fitted",
                "Aborting",
                "",
            )
            print_warning("Generating a report of the raw data, please have a look...")
            no_peaks_report = generate_no_peaks_report(fsa)
            no_peaks_report.save(f"{output}/{fsa.file_name}_fraggler_fail.html")
            print_warning("Continuing to the next file...")
            print("")
            FAILED_FILES.append(fsa.file_name)
            continue

        if custom_peaks:
            print_green("Using custom peaks")
            # test if custom peaks are ok
            custom_peaks = read_valid_csv(custom_peaks)
            custom_peaks_are_overlapping(custom_peaks)
            custom_peaks_has_columns(custom_peaks)
            fsa = find_peaks_customized(
                fsa,
                custom_peaks,
                peak_height_sample_data=peak_height_sample_data,
                search_peaks_start=search_peaks_start,
            )
        else:  # find peak agnostic
            print_green("Finding peaks agnostic")
            fsa = find_peaks_agnostic(
                fsa,
                peak_height_sample_data=peak_height_sample_data,
                min_ratio=min_ratio_to_allow_peak,
                distance_between_assays=distance_between_assays,
                search_peaks_start=search_peaks_start,
            )

        # if no found peaks
        if fsa.found_peaks == "error":
            print_warning(f"No peaks could be detected for {fsa.file_name}...")
            print_warning("Try changing the --peak_height_sample_data")
            print_warning(f"Current value: {peak_height_sample_data}")
            write_log(
                LOG_FILE,
                "No peaks in sample data channel could be identified",
                "Aborting",
                "",
            )
            print_warning("Generating a report of the raw data, please have a look...")
            no_peaks_report = generate_no_peaks_report(fsa)
            no_peaks_report.save(f"{output}/{fsa.file_name}_fraggler_fail.html")
            print_warning("Continuing to the next file...")
            print("")
            FAILED_FILES.append(fsa.file_name)
            continue

        print_blue(f"Found {fsa.identified_sample_data_peaks.assay.nunique()} assays")
        print_blue(f"Found {fsa.identified_sample_data_peaks.shape[0]} peaks")
        write_log(
            LOG_FILE,
            f"Found {fsa.identified_sample_data_peaks.assay.nunique()} assays",
            f"Found {fsa.identified_sample_data_peaks.shape[0]} peaks",
        )

        if sub_command == "peak":
            # save csv
            peak_table = fsa.identified_sample_data_peaks.assign(
                file_name=fsa.file_name
            )[["basepairs", "assay", "peak_name", "file_name"]]

            PEAK_TABLES.append(peak_table)

            # create peak report
            print_green("Creating peak report...")
            report = generate_peak_report(fsa)
            report.save(f"{output}/{fsa.file_name}_fraggler_report.html")

            print_green(f"Fraggler done for {fsa.file_name}")
            print("")
            write_log(LOG_FILE, "")

        if sub_command == "area":
            fsa = find_peak_widths(fsa)
            fsa = find_peaks_with_padding(fsa)
            fsa = fit_lmfit_model_to_area(fsa, peak_area_model)

            if fsa.fitted_area_peaks.shape[0] == 0:
                print_warning(f"No areas could be fitted for {fsa.file_name}...")
                write_log(
                    LOG_FILE,
                    "No areas could be fitted",
                    "Aborting",
                    "",
                )
                print_warning("Generating a report of the raw data, please have a look...")
                no_peaks_report = generate_no_peaks_report(fsa)
                no_peaks_report.save(f"{output}/{fsa.file_name}_fraggler_fail.html")
                print_warning("Continuing to the next file...")
                print("")
                FAILED_FILES.append(fsa.file_name)
                continue

            fsa = calculate_quotients(fsa)
            fsa = update_identified_sample_data_peaks(fsa)

            # add peaks table to list and concat later
            peak_table = fsa.identified_sample_data_peaks[
                ["basepairs", "assay", "peak_name", "model", "r_value", "quotient"]
            ].assign(file_name=fsa.file_name)
            PEAK_TABLES.append(peak_table)

            # create report
            print_green("Creating area report...")
            report = generate_area_report(fsa)
            report.save(f"{output}/{fsa.file_name}_fraggler_report.html")

            print_green(f"Fraggler done for {fsa.file_name}")
            print("")
            write_log(LOG_FILE, "")

    # save csv file in peak report
    if len(PEAK_TABLES) > 0:
        pd.concat(PEAK_TABLES).to_csv(
            f"{output}/fraggler_peaks.csv",
            index=False,
        )

    print_green("Fraggler done!")

    if len(FAILED_FILES) > 0:
        print_warning("Following files failed:")
        write_log(LOG_FILE, "Following files failed:")
        write_log(LOG_FILE, *FAILED_FILES)
        for file in FAILED_FILES:
            print_warning(f" {file}")

    end_time = datetime.now()
    time_diff = end_time - start_time
    write_log(
        LOG_FILE,
        "---------------------------",
        f"Run time: {time_diff}",
        "---------------------------",
    )