```{toctree}
:hidden: True

self
environments
simpy
seeds
```

# General Python Stuff

## Using VS Code

To open VS Code from the terminal: `code .`

To activate a conda environment in VS Code: Ctrl+Shift+P > Python: Select Interpreter, then select the correct environment.

To view/create settings.json in the .vscode folder: Ctrl+Shift+P > Preferences: Open Workspace Settings (JSON).

## Type hinting in VS Code with Pylance

The Pylance extension should be installed (Extensions > Pylance).

In settings.json, set `"python.analysis.typeCheckingMode"` to `"strict"`, `"basic"` or `"off"`.

## Violin plot

```
import matplotlib.pyplot as plt
import numpy as np

# 'data' is assumed to be a DataFrame with 'quarter_year' and
# 'stroke_severity' columns
df = data.groupby('quarter_year')['stroke_severity'].agg(lambda x: list(x))

fig, ax = plt.subplots()
ax.violinplot(df)
ax.set_xticks(np.arange(1, len(df.index)+1), labels=df.index)
plt.show()
```

## Jupyter lab

Install the jupyterlab package. From the directory you want to open, run `jupyter lab` in the terminal.

## Comparing two dataframes

Simple check of whether they match (note: this sometimes returns False when there is no actual difference - in that case, try the assertion below):

```
df1.equals(df2)
```

Show the differences between them:

```
df1.compare(df2)
```

Show rows that appear in only one of the dataframes:

```
pd.concat([df1, df2]).drop_duplicates(keep=False)
```

Get feedback on exactly what is mismatched:

```
from pandas.testing import assert_frame_equal
assert_frame_equal(df1, df2)
```

## Linting

Install the flake8 and nbqa packages.

To lint a Python file: `flake8 filename.py`

To lint a Jupyter notebook: `nbqa flake8 filename.ipynb`

To lint within a notebook (this requires the pycodestyle_magic package): `%load_ext pycodestyle_magic` and `%pycodestyle_on`

## Package

https://betterscientificsoftware.github.io/python-for-hpc/tutorials/python-pypi-packaging/

* Make setup.py (a minimal sketch follows this list) - if you read from requirements.txt within it, you will need to make a MANIFEST.in file which includes the line `include requirements.txt`
* `python setup.py check`
* `python setup.py sdist bdist_wheel`
* `pip install twine`
* `twine upload --repository-url https://test.pypi.org/legacy/ dist/*`
* When you want to update the package:
    * Delete the dist folder
    * `python setup.py sdist bdist_wheel`
    * `twine upload --skip-existing --repository-url https://test.pypi.org/legacy/ dist/*`
* The above uses the test PyPI URL - for the real PyPI, upload to `https://upload.pypi.org/legacy/`
* To install the package locally (rather than from PyPI) using your requirements.txt file - if, for example, it was in a sister directory - add `../foldername/dist/filename.whl` to the text file
* If you are simultaneously working on the package and other code - i.e. you want live changes to the package rather than having to rebuild and reinstall it each time - use `pip install -e /path/to/repo/`. This uses a symbolic link to the repository, meaning any changes to the code are automatically reflected. For example, when it's in a sister directory: `pip install -e ../kailo_beewell_dashboard_package`. You can also do this from your **requirements.txt** by adding `-e ../packagefolder`. If the package was already installed in your environment, you'll need to uninstall it first (or just delete and remake the environment).
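As a rough guide, a minimal setup.py for the workflow above might look like the sketch below. The package name, version and description are hypothetical placeholders, and it assumes a requirements.txt sits alongside setup.py (hence the MANIFEST.in note above).

```
from setuptools import setup, find_packages

# Read dependencies from requirements.txt - this is why MANIFEST.in must
# include it when building the source distribution
with open('requirements.txt') as f:
    requirements = f.read().splitlines()

setup(
    name='my_package',  # hypothetical placeholder name
    version='0.1.0',
    description='Example package',  # hypothetical placeholder
    packages=find_packages(),
    install_requires=requirements,
)
```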
## Unit testing

Note: unit testing is typically for testing function outputs - so although the code below works, you wouldn't typically write unit tests for this purpose.

Example:

```
import os
import pandas as pd
import unittest


class DataTests(unittest.TestCase):
    '''Class for running unit tests'''

    # Run set up once for the whole class
    @classmethod
    def setUpClass(cls):
        '''Set up - runs once, prior to any of the tests'''
        # Paths and filenames
        raw_path: str = './data'
        raw_filename: str = 'SAMueL ssnap extract v2.csv'
        clean_path: str = './output'
        clean_filename: str = 'reformatted_data.csv'

        # Import dataframes
        raw_data = pd.read_csv(os.path.join(raw_path, raw_filename),
                               low_memory=False)
        clean_data = pd.read_csv(os.path.join(clean_path, clean_filename),
                                 low_memory=False)

        # Save to DataTests class
        cls.raw = raw_data
        cls.clean = clean_data

    def freq(self, raw_col, raw_val, clean_col, clean_val):
        '''
        Test that the frequency of a value in the raw data is the same as
        the frequency of a value in the cleaned data
        Inputs:
        - self
        - raw_col and clean_col = string
        - raw_val and clean_val = string, number, or list
        Performs assertEqual test.
        '''
        # If values are not lists, convert to lists
        if not isinstance(raw_val, list):
            raw_val = [raw_val]
        if not isinstance(clean_val, list):
            clean_val = [clean_val]
        # Find frequencies and check if equal
        raw_freq = (self.raw[raw_col].isin(raw_val).values).sum()
        clean_freq = (self.clean[clean_col].isin(clean_val).values).sum()
        self.assertEqual(raw_freq, clean_freq)

    def time_neg(self, time_column):
        '''
        Function for testing that times are not negative when expected to
        be positive.
        Input: time_column = string, column with times
        '''
        self.assertEqual(sum(self.clean[time_column] < 0), 0)

    def equal_array(self, df, col, exp_array):
        '''
        Function to check that the only possible values in a column are
        those provided by exp_array.
        Inputs:
        - df = dataframe (raw or clean)
        - col = string (column name)
        - exp_array = array (expected values for column)
        '''
        # Sorted so that array order does not matter
        self.assertEqual(sorted(df[col].unique()), sorted(exp_array))

    def test_raw_shape(self):
        '''Test the raw dataframe shape is as expected'''
        self.assertEqual(self.raw.shape, (360381, 83))

    def test_id(self):
        '''Test that ID numbers are all unique'''
        self.assertEqual(len(self.clean.id.unique()), len(self.clean.index))

    def test_onset(self):
        '''Test that onset_known is equal to precise + best estimate'''
        self.freq('S1OnsetTimeType', ['P', 'BE'], 'onset_known', 1)
        self.freq('S1OnsetTimeType', 'NK', 'onset_known', 0)

    def test_time_negative(self):
        '''Test that times are not negative when expected to be positive'''
        time_col = ['onset_to_arrival_time',
                    'call_to_ambulance_arrival_time',
                    'ambulance_on_scene_time',
                    'ambulance_travel_to_hospital_time',
                    'ambulance_wait_time_at_hospital',
                    'scan_to_thrombolysis_time',
                    'arrival_to_thrombectomy_time']
        for col in time_col:
            with self.subTest(msg=col):
                self.time_neg(col)

    def test_no_ambulance(self):
        '''
        Test that people who do not arrive by ambulance therefore have no
        ambulance times
        '''
        amb_neg = self.clean[
            (self.clean['arrive_by_ambulance'] == 0) &
            ((self.clean['call_to_ambulance_arrival_time'].notnull()) |
             (self.clean['ambulance_on_scene_time'].notnull()) |
             (self.clean['ambulance_travel_to_hospital_time'].notnull()) |
             (self.clean['ambulance_wait_time_at_hospital'].notnull()))]
        self.assertEqual(len(amb_neg.index), 0)


if __name__ == '__main__':
    unittest.main()
```

## Colour scales

```
'''
Helper functions for creating discrete colour scale sampling in the
continuous scale between two provided colours.
Source: https://bsouthga.dev/posts/color-gradients-with-python
Copied from that webpage, all credit to them.
'''


def hex_to_RGB(hex):
    ''' "#FFFFFF" -> [255,255,255] '''
    # Pass 16 to the integer function for change of base
    return [int(hex[i:i+2], 16) for i in range(1, 6, 2)]


def RGB_to_hex(RGB):
    ''' [255,255,255] -> "#FFFFFF" '''
    # Components need to be integers for hex to make sense
    RGB = [int(x) for x in RGB]
    return "#" + "".join(["0{0:x}".format(v) if v < 16 else
                          "{0:x}".format(v) for v in RGB])


def color_dict(gradient):
    '''
    Takes in a list of RGB sub-lists and returns dictionary of colors in
    RGB and hex form for use in a graphing function defined later on
    '''
    return {"hex": [RGB_to_hex(RGB) for RGB in gradient],
            "r": [RGB[0] for RGB in gradient],
            "g": [RGB[1] for RGB in gradient],
            "b": [RGB[2] for RGB in gradient]}


def linear_gradient(start_hex, finish_hex="#FFFFFF", n=10):
    '''
    Returns a gradient list of (n) colors between two hex colors.
    start_hex and finish_hex should be the full six-digit color string,
    including the number sign ("#FFFFFF")
    '''
    # Starting and ending colors in RGB form
    s = hex_to_RGB(start_hex)
    f = hex_to_RGB(finish_hex)
    # Initialize a list of the output colors with the starting color
    RGB_list = [s]
    # Calculate a color at each evenly spaced value of t from 1 to n
    for t in range(1, n):
        # Interpolate RGB vector for color at the current value of t
        curr_vector = [
            int(s[j] + (float(t)/(n-1))*(f[j]-s[j]))
            for j in range(3)
        ]
        # Add it to our list of output colors
        RGB_list.append(curr_vector)
    return color_dict(RGB_list)
```
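A quick usage sketch of `linear_gradient` (not from the source webpage - the hex values and bar heights here are just arbitrary examples): sample five colours between two hexes and use them to colour a bar plot.

```
import matplotlib.pyplot as plt

# Sample 5 colours from dark blue to light grey (arbitrary example hexes)
gradient = linear_gradient('#1F77B4', '#DDDDDD', n=5)
print(gradient['hex'])  # list of 5 hex strings

# Use the hex colours for a simple bar plot
fig, ax = plt.subplots()
ax.bar(range(5), [3, 5, 2, 4, 1], color=gradient['hex'])
plt.show()
```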
## Save HTML to csv and then import again and convert to PDF (this is not ideal tbh)

```
import csv

import weasyprint

# 'html_content' is assumed to already hold the report HTML as a string

# Write the HTML string as a single-cell CSV
with open('out.csv', 'w', encoding='utf-8', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow([html_content])

# Read it back with csv.reader, which undoes the quoting/escaping that
# csv.writer applied (safer than stripping characters by hand)
with open('out.csv', 'r', encoding='utf-8', newline='') as csv_file:
    reader = csv.reader(csv_file)
    html_check = next(reader)[0]

weasyprint.HTML(string=html_check).write_pdf('report/report.pdf')
```
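If the CSV file is only there to persist the HTML between runs, writing a plain .html file avoids the quoting and escaping issues entirely. A minimal sketch, still assuming `html_content` holds the report HTML:

```
import weasyprint

# Save the HTML to a plain .html file rather than a CSV
with open('out.html', 'w', encoding='utf-8') as f:
    f.write(html_content)

# Later: read it back and convert straight to PDF
with open('out.html', 'r', encoding='utf-8') as f:
    weasyprint.HTML(string=f.read()).write_pdf('report/report.pdf')
```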