# The first step when using python is to import the packages which you want to use.
# It is good practice to import all the packages you want to use in one place (as opposed to throughout the file)

# Packages are bits of code others have written and you have installed, which you can call from your programs
# this is super helpful as it means you don't have to write all this code again

# Some particularly helpful packages are:
#   - numpy for more powerful mathematics operations such as linear algebra and statistics
#   - matplotlib for graphing and visualization
#   - scipy for some generally helpful scientific tools such as curve fitting
#   - pandas for big data handling
#   - astropy for tools specific to astronomy such as time corrections and sky coordinates

# The syntax to bring packages into python can be somewhat confusing, I'll try to break down the basic rules below
# import statements can either start with "import" or "from"
# statements are basically human readable
# the "as" keyword is like assigning a shortcut name to a thing

# so for example you might say
# import numpy as np
# this will bring in all the code from the numpy package to your script. You can then access the code by writing np.<function_name> (for example np.sin(2*np.pi) would return the sine of 2 pi)
# if you didn't have "as" and instead just wrote
# import numpy
# you would instead write numpy.sin(2*numpy.pi)

# you can also import specific things from a package
# from numpy import pi, sin
# in this case you can simply write sin(2*pi) but you won't have access to any of the other things within the package (np.<function_name> won't work because you never defined what np is)

# sometimes packages have subpackages within them
# import numpy.random as rand
# here the subpackage is called random, you can then access anything within random by doing rand.<function_name> (such as rand.normal() for a normal distribution)

# similarly to above you could also just grab one part
# from numpy.random import normal as norm
# this would then let you simply call norm()

# note how you can combine / mix and match the import, from, and as keywords

# Below I will import a few important packages in the way that I most commonly use them (including their most common / standard abbreviations)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from scipy.stats import gaussian_kde
from matplotlib.colors import LogNorm


# Name of the data file. There is no directory in front of it, so Python will
# look for it in the current working directory.
FILEPATH = "hlsp_acsggct_hst_acs-wfc_ngc2808_r.rdviq.cal.adj.zpt"


# As a first check let's make sure the file exists where we think it does!
# If the file exists this prints "Okay!"; otherwise we stop right here with an
# error, which is much easier to understand than a failure further down.
# FileNotFoundError is the specific kind of OSError that Python uses for a
# missing file, so anything written to catch OSError will still catch it.
# Don't worry about the details of how the error is raised right now.
if not os.path.exists(FILEPATH):
    raise FileNotFoundError(f"file {os.path.basename(FILEPATH)} does not exist!")

# We only get this far if the error above was not raised.
print("Okay!")

Okay!


# Read the file "by hand": open() gives us a file object, read() pulls the whole
# file into one string, and close() hands the file back to the operating system.
# The try/finally guarantees that close() still runs even if read() raises an
# error part way through, so the file is never left open by accident.
file = open(FILEPATH)
try:
    file_contents = file.read()
finally:
    file.close()

# Because the file is really large it would take a long time to print all of it,
# so this prints just the first 100 characters to the screen to make sure the read worked.
print(file_contents[:100])

id x y Vvega err VIvega err Ivega err Vground Iground Nv  Ni wV wI xsig ysig othv othi qfitV qfitI R


# The same read using a "with" block: the file is closed automatically as soon
# as the indented block ends, so there is no close() call to remember.
with open(FILEPATH) as file:
    file_contents = file.read()

# Sanity check: show only the first 100 characters rather than the whole file.
print(file_contents[0:100])

id x y Vvega err VIvega err Ivega err Vground Iground Nv  Ni wV wI xsig ysig othv othi qfitV qfitI R


# Break file_contents into a list of strings, one entry per line of the file.
# "\n" is the newline character. When a file is read with open() in its default
# text mode (as above), Python converts other line endings, such as the Windows
# "\r\n", into "\n" while reading, so this split behaves the same on Linux,
# Mac OS and Windows.
lines = file_contents.split("\n")

# The header is the first line of the file (index 0, because Python counts from 0).
header_row = lines[0]

# header_row is still one long string; what we really want is a list of column names.
# split() with no arguments cuts a string apart wherever there is whitespace and
# throws the whitespace away, leaving a list of the pieces in between.
header = header_row.split()
print(header)

['id', 'x', 'y', 'Vvega', 'err', 'VIvega', 'err', 'Ivega', 'err', 'Vground', 'Iground', 'Nv', 'Ni', 'wV', 'wI', 'xsig', 'ysig', 'othv', 'othi', 'qfitV', 'qfitI', 'RA', 'Dec']


# The slice lines[n:m] takes the entries starting AT index n (n is included) and
# stopping just BEFORE index m (m is not included).
# Negative numbers count back from the end, so -1 is the last entry.
# Index 0 is the header and indices 1 and 2 are the two blank lines that follow it
# in this file, so the data start at index 3.
# We stop before the last entry because, if you open the file and look at it,
# you will see the last line is blank with no data on it.
data_list = lines[3:-1]

# data_list is a list of lines; each line is still a single string. We want to turn
# every line into its own list of values. There are a few ways to do this, and one
# of the best is "list comprehension".

# A list comprehension walks over the elements of a list and performs an operation
# on each one in turn, collecting the results in a new list.
# Here every line is split on whitespace, exactly as we did for the column names,
# which leaves us with a 2D list (a table: a list of rows, each row a list of entries).
data_seperated = [line.split() for line in data_list]

# One major problem remains: the entries in data_seperated are strings.
# They look like numbers, but the computer only knows them as the characters
# that spell those numbers, so it cannot do any math with them.

# Most languages (Python included) have what is called "casting": taking the string
# representation of a number and turning it into an actual numeric value.
# Next we cast every entry to a "float" (a decimal number).

# data_seperated is 2D, so we need two loops, one nested inside the other.
# List comprehension keeps this straightforward: for every row (fields), and for
# every entry in that row, turn the entry into a float.
data_cast = [[float(entry) for entry in fields] for fields in data_seperated]


# This is a lot like list comprehension, but it is dictionary comprehension (note the {} as opposed to []).
# Dictionaries are key-value paired data structures: the key is defined on the left side of the :
# and the value assigned to that key is on the right side of the :

# A dictionary can only hold each key once. The header uses the name 'err' for more
# than one column, so if we used the names exactly as they are, each 'err' column
# would overwrite the one before it and we would silently be left with only the last.
# To keep every column we first build a list of unique names, adding a counter to any
# repeat: 'err', 'err.1', 'err.2', ... (the same names pandas gives repeated columns).
name_counts = {}
unique_header = []
for column_name in header:
    times_seen = name_counts.get(column_name, 0)
    if times_seen:
        unique_header.append(f"{column_name}.{times_seen}")
    else:
        unique_header.append(column_name)
    name_counts[column_name] = times_seen + 1

# Here we define a dictionary called parsed.
# We loop over the "enumeration" of unique_header: this means loop over every column
# name, but don't just return the name, also count up from 0 (in other words also
# return the position of that name in the list).
# enumerate gives the index first and then the value, so on each pass of the loop the
# index unpacks into column_number and the name at that index unpacks into column_name.
# The column name becomes the key (the left side of the :) and the column number is used
# to pick the matching entry out of every row of the data table built in the previous cell.
parsed = {column_name: [row[column_number] for row in data_cast] for column_number, column_name in enumerate(unique_header)}


# Take only the first 100 entries of the x and y columns so the plot is quick to draw.
x, y = parsed['x'][:100], parsed['y'][:100]

# '.' asks for small point markers with no line joining them.
plt.plot(x, y, '.')

[<matplotlib.lines.Line2D at 0x7fc5f81a73c8>]


# Everything we did by hand above (plus a good bit of behind-the-scenes housekeeping)
# is handled by this one call.
# read_csv reads a delimited text file into a table (a pandas DataFrame):
#   - first we give it the filepath
#   - sep tells it how the columns are separated. By default it assumes commas, but
#     here the data are separated by spaces. The regular expression \s+ means
#     "one or more whitespace characters" (spaces, tabs, and a few more esoteric ones)
#   - skip_blank_lines tells pandas to ignore empty lines; we know there are 2 blank
#     lines between the header and the data
parsed = pd.read_csv(
    FILEPATH,
    sep=r"\s+",
    skip_blank_lines=True,
)


# One of the nice things pandas does is give us an easy to read table when we look
# at our data. In a notebook, a variable on the last line of a cell is displayed.
parsed


# Plot the first 100 rows of the x and y columns again, this time from the pandas
# table. This should look the same as the plot made from the hand-parsed data.
plt.plot(parsed['x'].head(100), parsed['y'].head(100), '.')

[<matplotlib.lines.Line2D at 0x7fc670f5d6a0>]


# Pull these two columns out of the parsed dataframe. This is not strictly necessary;
# however, in programming you often do things purely to keep the code clean.
# Having them as their own variables means writing Vvega / Ivega from here on instead
# of parsed['Vvega'] / parsed['Ivega'], which makes the later code easier to read.
Vvega, Ivega = parsed['Vvega'], parsed['Ivega']


# Let's plot these now: Vvega - Ivega along x and Vvega along y.


# '.' tells matplotlib to draw small points (as opposed to 'o', which draws large points;
# with no format string at all it would try to connect all the points with lines)
plt.plot(Vvega - Ivega, Vvega, '.')

# Label both axes so the reader knows what is being plotted.
plt.xlabel('F606W-F814W')
plt.ylabel('F606W')

Text(0, 0.5, 'F606W')


# Let's take a look at this in more detail

# subplots can lay out several plots in a grid with one call. Here we only want one
# plot, so we ask for a grid of 1 row and 1 column.
# It returns two things: the figure (which holds the axes) and the axes (which you plot to).
# figsize is the (width, height) of the figure in inches; pick a size that works for you.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 7))

# The same plot as before, drawn through the axes object instead of through plt.
ax.plot(Vvega - Ivega, Vvega, '.')
ax.set_xlabel('F606W-F814W')
ax.set_ylabel('F606W')

Text(0, 0.5, 'F606W')


# One plot on a 10 x 7 figure, as before.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 7))

ax.plot(Vvega - Ivega, Vvega, '.')

# The axes object has many helpful functions for customizing a plot.
# This one flips the y axis so that its values increase downwards.
ax.invert_yaxis()

# fontsize makes the axis labels larger and easier to read.
ax.set_xlabel('F606W-F814W', fontsize=25)
ax.set_ylabel('F606W', fontsize=25)

Text(0, 0.5, 'F606W')


# color is the Vvega column minus the Ivega column (the x axis of the plots above).
color = Vvega - Ivega

# gaussian_kde builds a function that we can evaluate at any point in our domain and
# that returns the estimated density of data points there.
# So kde_f takes a coordinate as its argument (such as kde_f((0, 0))) and returns a
# density estimate. It is built from a 2-row array: color in one row, Vvega in the other.
xy = np.vstack([color, Vvega])
kde_f = gaussian_kde(xy)

# We could evaluate the density at every point we have data for.
# However... that is a lot of points and would take a long time to run.
# A faster way is to build a "mesh": a regular grid of points covering some parameter
# space. Here the mesh runs from the min to the max of color in one direction and from
# the min to the max of Vvega in the other.

# This is a bit of numpy magic; don't worry about the details for now, just know that
# things like this are sometimes needed. The 100j is what asks for 100 evenly spaced
# points along each direction.
X, Y = np.mgrid[color.min():color.max():100j, Vvega.min():Vvega.max():100j]

# Flatten the two grids and stack them so every mesh point becomes one (color, Vvega)
# column, which is the layout kde_f expects.
positions = np.vstack([X.ravel(), Y.ravel()])

# Evaluate kde_f on the mesh, then fold the flat list of densities back into the shape
# of the mesh so that it becomes an "image" (2D data / a matrix).
Z = kde_f(positions).reshape(X.shape)


# Then we plot that image, much as we did before. Note that for a 2D data set
# (an image) we use the function imshow rather than plot.
fig, ax = plt.subplots(1, 1, figsize=(10, 7))

# extent tells matplotlib what numeric range the image covers, in the order
# [x_min, x_max, y_min, y_max]. LogNorm colors the image on a logarithmic scale.
ax.imshow(np.rot90(Z), extent=[color.min(), color.max(), Vvega.min(), Vvega.max()], norm=LogNorm())

# Flip the y axis, as was done for the scatter plot of these same data.
# This has to come AFTER imshow: imshow sets the axis limits to match extent, so
# flipping the axis first would simply be overwritten and have no effect.
ax.invert_yaxis()

<matplotlib.image.AxesImage at 0x7fc5f016a9e8>


# Draw the density image once more, this time with labels, and save it to a file.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 7))

# Dividing by Z.max() rescales the densities so that the largest value is 1.
# NOTE(review): the y axis is not inverted in this figure, unlike the scatter
# version of this plot - confirm that is intended.
extent = [color.min(), color.max(), Vvega.min(), Vvega.max()]
img = ax.imshow(np.rot90(Z / Z.max()), extent=extent, norm=LogNorm())

ax.set_xlabel("F606W-F814W", fontsize=25)
ax.set_ylabel("F606W", fontsize=25)

# bbox_inches="tight" tells matplotlib to crop the saved file to the contents of the
# figure; without it you end up with a lot of empty whitespace around the plot.
fig.savefig("NGC2808_principal_exampleCMD.pdf", bbox_inches="tight")

	id	x	y	Vvega	err	VIvega	err.1	Ivega	err.2	Vground	...	wV	wI	xsig	ysig	othv	othi	qfitV	qfitI	RA	Dec
0	1	1500.801	975.634	19.905	0.0029	0.634	0.0043	19.271	0.0032	20.077	...	1	1	0.014	0.010	0.000	0.000	0.059	0.024	138.059217	-64.891138
1	2	1532.085	861.844	21.514	0.0061	0.917	0.0085	20.597	0.0059	21.783	...	1	1	0.022	0.001	0.121	0.009	0.067	0.152	138.058196	-64.892718
2	3	1530.230	872.678	19.653	0.0026	0.600	0.0039	19.053	0.0029	19.811	...	1	1	0.016	0.009	0.002	0.002	0.054	0.027	138.058256	-64.892568
3	4	1523.191	888.185	23.043	0.0125	1.028	0.0170	22.015	0.0115	23.347	...	1	1	0.056	0.040	0.000	0.000	0.116	0.075	138.058486	-64.892352
4	5	1527.557	920.728	22.079	0.0080	0.853	0.0113	21.226	0.0080	22.328	...	1	1	0.005	0.001	0.000	0.000	0.072	0.048	138.058342	-64.891901
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
308728	308729	4442.749	5098.706	25.143	0.0340	1.483	0.0423	23.660	0.0252	25.567	...	1	1	0.014	0.031	0.002	0.001	0.183	0.167	137.963025	-64.833873
308729	308730	4471.442	5010.146	22.015	0.0079	0.864	0.0111	21.151	0.0078	22.268	...	1	1	0.009	0.001	0.000	0.000	0.065	0.034	137.962086	-64.835103
308730	308731	4484.483	4999.601	21.080	0.0051	0.765	0.0074	20.315	0.0053	21.299	...	1	1	0.010	0.002	0.000	0.000	0.042	0.029	137.961660	-64.835249
308731	308732	4464.859	5026.023	22.737	0.0110	1.009	0.0150	21.728	0.0102	23.035	...	1	1	0.008	0.007	0.146	0.115	0.075	0.045	137.962302	-64.834883
308732	308733	4470.596	5047.705	25.217	0.0355	1.328	0.0453	23.889	0.0282	25.603	...	1	1	0.040	0.079	0.000	0.000	0.228	0.110	137.962115	-64.834581

Data Handling and Visualization Pt. 1¶

Reading in Files¶

Manually¶

With Pandas¶

Plotting¶