Materialized Views¶
Materialized Views in Synapse allow you to create queryable views that store the results of a Synapse SQL statement. These views are useful for combining, filtering, or transforming data from multiple tables into a single, queryable entity.
This tutorial will walk you through the basics of working with Materialized Views using the Synapse Python client.
Tutorial Purpose¶
In this tutorial, you will:
- Log in, get your project, and create tables with data
- Create and query a Materialized View
- Create and query a Materialized View with a JOIN clause
- Create and query a Materialized View with a LEFT JOIN clause
- Create and query a Materialized View with a RIGHT JOIN clause
- Create and query a Materialized View with a UNION clause
Prerequisites¶
- This tutorial assumes that you have a Synapse project.
- Pandas must also be installed as shown in the installation documentation.
1. Log in, get your project, and create tables with data¶
Before creating Materialized Views, we need to log in to Synapse, retrieve your project, and create the tables with data that will be used in the views.
You will want to replace "My uniquely named project about Alzheimer's Disease"
with
the name of your project.
import pandas as pd
from synapseclient import Synapse
from synapseclient.models import Column, ColumnType, MaterializedView, Project, Table
# Initialize Synapse client
syn = Synapse()
syn.login()
# Get the project where we want to create the materialized view
project = Project(name="My uniquely named project about Alzheimer's Disease").get()
project_id = project.id
print(f"Got project with ID: {project_id}")
# Create the first table with some columns and rows
table1_columns = [
Column(name="sample_id", column_type=ColumnType.STRING),
Column(name="patient_id", column_type=ColumnType.STRING),
Column(name="age", column_type=ColumnType.INTEGER),
Column(name="diagnosis", column_type=ColumnType.STRING),
]
table1 = Table(
name="Patient Demographics",
parent_id=project_id,
columns=table1_columns,
)
table1 = table1.store()
print(f"Created table 1 with ID: {table1.id}")
# Add rows to the first table
data1 = pd.DataFrame(
[
{"sample_id": "S1", "patient_id": "P1", "age": 70, "diagnosis": "Alzheimer's"},
{"sample_id": "S2", "patient_id": "P2", "age": 65, "diagnosis": "Healthy"},
{"sample_id": "S3", "patient_id": "P3", "age": 72, "diagnosis": "Alzheimer's"},
{"sample_id": "S4", "patient_id": "P4", "age": 68, "diagnosis": "Healthy"},
{"sample_id": "S5", "patient_id": "P5", "age": 75, "diagnosis": "Alzheimer's"},
{"sample_id": "S6", "patient_id": "P6", "age": 80, "diagnosis": "Healthy"},
]
)
table1.upsert_rows(values=data1, primary_keys=["sample_id"])
# Create the second table with some columns and rows
table2_columns = [
Column(name="sample_id", column_type=ColumnType.STRING),
Column(name="gene", column_type=ColumnType.STRING),
Column(name="expression_level", column_type=ColumnType.DOUBLE),
]
table2 = Table(
name="Gene Expression Data",
parent_id=project_id,
columns=table2_columns,
)
table2 = table2.store()
print(f"Created table 2 with ID: {table2.id}")
# Add rows to the second table
data2 = pd.DataFrame(
[
{"sample_id": "S1", "gene": "APOE", "expression_level": 2.5},
{"sample_id": "S2", "gene": "APP", "expression_level": 1.8},
{"sample_id": "S3", "gene": "PSEN1", "expression_level": 3.2},
{"sample_id": "S4", "gene": "MAPT", "expression_level": 2.1},
{"sample_id": "S5", "gene": "APP", "expression_level": 3.5},
{"sample_id": "S7", "gene": "PSEN2", "expression_level": 1.9},
]
)
table2.upsert_rows(values=data2, primary_keys=["sample_id"])
2. Create and query a Materialized View¶
First, we will create a simple Materialized View that selects all rows from a table and then query it to retrieve the results.
def create_materialized_view():
"""
Example: Create a new materialized view with a defining SQL query.
"""
materialized_view = MaterializedView(
name="Patient Data View",
description="A view combining patient demographics and gene expression data",
parent_id=project_id,
defining_sql=f"SELECT * FROM {table1.id}",
)
materialized_view = materialized_view.store()
print(f"Created Materialized View with ID: {materialized_view.id}")
materialized_view_id = materialized_view.id
query = f"SELECT * FROM {materialized_view_id}"
query_result: pd.DataFrame = materialized_view.query(
query=query, include_row_id_and_row_version=False
)
# Print the results to the console
print("Results from the materialized view:")
print(query_result)
The result of querying your Materialized View should look like:
Results from the materialized view:
sample_id patient_id age diagnosis
0 S1 P1 70 Alzheimer's
1 S2 P2 65 Healthy
2 S3 P3 72 Alzheimer's
3 S4 P4 68 Healthy
4 S5 P5 75 Alzheimer's
5 S6 P6 80 Healthy
3. Create and query a Materialized View with a JOIN clause¶
Next, we will create a Materialized View that combines data from two tables using a JOIN clause and then query it to retrieve the results.
def create_materialized_view_with_join():
"""
Example: Create a materialized view with a JOIN clause.
"""
defining_sql = f"""
SELECT t1.sample_id AS sample_id, t1.patient_id AS patient_id, t1.age AS age, t1.diagnosis AS diagnosis,
t2.gene AS gene, t2.expression_level AS expression_level
FROM {table1.id} t1
JOIN {table2.id} t2
ON t1.sample_id = t2.sample_id
"""
materialized_view = MaterializedView(
name="Joined Patient Data View",
description="A materialized view joining patient demographics with gene expression data",
parent_id=project_id,
defining_sql=defining_sql,
)
materialized_view = materialized_view.store()
print(f"Created Materialized View with ID: {materialized_view.id}")
materialized_view_id = materialized_view.id
query = f"SELECT * FROM {materialized_view_id}"
query_result: pd.DataFrame = materialized_view.query(
query=query, include_row_id_and_row_version=False
)
# Print the results to the console
print("Results from the materialized view with JOIN:")
print(query_result)
The result of querying your Materialized View with a JOIN clause should look like:
Results from the materialized view with JOIN:
sample_id patient_id age diagnosis gene expression_level
0 S1 P1 70 Alzheimer's APOE 2.5
1 S2 P2 65 Healthy APP 1.8
2 S3 P3 72 Alzheimer's PSEN1 3.2
3 S4 P4 68 Healthy MAPT 2.1
4 S5 P5 75 Alzheimer's APP 3.5
4. Create and query a Materialized View with a LEFT JOIN clause¶
We can also create a Materialized View that includes all rows from one table and matches rows from another table using a LEFT JOIN clause and then query it to retrieve the results.
def create_materialized_view_with_left_join():
"""
Example: Create a materialized view with a LEFT JOIN clause.
"""
defining_sql = f"""
SELECT t1.sample_id AS sample_id, t1.patient_id AS patient_id, t1.age AS age, t1.diagnosis AS diagnosis,
t2.gene AS gene, t2.expression_level AS expression_level
FROM {table1.id} t1
LEFT JOIN {table2.id} t2
ON t1.sample_id = t2.sample_id
"""
materialized_view = MaterializedView(
name="Left Joined Patient Data View",
description="A materialized view with a LEFT JOIN clause, including all patients even if they lack gene expression data",
parent_id=project_id,
defining_sql=defining_sql,
)
materialized_view = materialized_view.store()
print(f"Created Materialized View with ID: {materialized_view.id}")
materialized_view_id = materialized_view.id
query = f"SELECT * FROM {materialized_view_id}"
query_result: pd.DataFrame = materialized_view.query(
query=query, include_row_id_and_row_version=False
)
# Print the results to the console
print("Results from the materialized view with LEFT JOIN:")
print(query_result)
The result of querying your Materialized View with a LEFT JOIN clause should look like:
Results from the materialized view with LEFT JOIN:
sample_id patient_id age diagnosis gene expression_level
0 S1 P1 70 Alzheimer's APOE 2.5
1 S2 P2 65 Healthy APP 1.8
2 S3 P3 72 Alzheimer's PSEN1 3.2
3 S4 P4 68 Healthy MAPT 2.1
4 S5 P5 75 Alzheimer's APP 3.5
5 S6 P6 80 Healthy NaN NaN
5. Create and query a Materialized View with a RIGHT JOIN clause¶
Similarly, we can create a Materialized View that includes all rows from one table and matches rows from another table using a RIGHT JOIN clause and then query it to retrieve the results.
def create_materialized_view_with_right_join():
"""
Example: Create a materialized view with a RIGHT JOIN clause.
"""
defining_sql = f"""
SELECT t2.sample_id AS sample_id, t1.patient_id AS patient_id, t1.age AS age, t1.diagnosis AS diagnosis,
t2.gene AS gene, t2.expression_level AS expression_level
FROM {table1.id} t1
RIGHT JOIN {table2.id} t2
ON t1.sample_id = t2.sample_id
"""
materialized_view = MaterializedView(
name="Right Joined Patient Data View",
description="A materialized view with a RIGHT JOIN clause, including all gene expression data even if no patient matches",
parent_id=project_id,
defining_sql=defining_sql,
)
materialized_view = materialized_view.store()
print(f"Created Materialized View with ID: {materialized_view.id}")
materialized_view_id = materialized_view.id
query = f"SELECT * FROM {materialized_view_id}"
query_result: pd.DataFrame = materialized_view.query(
query=query, include_row_id_and_row_version=False
)
# Print the results to the console
print("Results from the materialized view with RIGHT JOIN:")
print(query_result)
The result of querying your Materialized View with a RIGHT JOIN clause should look like:
Results from the materialized view with RIGHT JOIN:
sample_id patient_id age diagnosis gene expression_level
0 S1 P1 70.0 Alzheimer's APOE 2.5
1 S2 P2 65.0 Healthy APP 1.8
2 S3 P3 72.0 Alzheimer's PSEN1 3.2
3 S4 P4 68.0 Healthy MAPT 2.1
4 S5 P5 75.0 Alzheimer's APP 3.5
5 S7 NaN NaN NaN PSEN2 1.9
6. Create and query a Materialized View with a UNION clause¶
Finally, we can create a Materialized View that combines rows from two tables using a UNION clause and then query it to retrieve the results.
def create_materialized_view_with_union():
"""
Example: Create a materialized view with a UNION clause.
"""
defining_sql = f"""
SELECT t1.sample_id AS sample_id
FROM {table1.id} t1
UNION
SELECT t2.sample_id AS sample_id
FROM {table2.id} t2
"""
materialized_view = MaterializedView(
name="Union Patient Data View",
description="A materialized view with a UNION clause",
parent_id=project_id,
defining_sql=defining_sql,
)
materialized_view = materialized_view.store()
print(f"Created Materialized View with ID: {materialized_view.id}")
materialized_view_id = materialized_view.id
query = f"SELECT * FROM {materialized_view_id}"
query_result: pd.DataFrame = materialized_view.query(
query=query, include_row_id_and_row_version=False
)
# Print the results to the console
print("Results from the materialized view with UNION:")
print(query_result)
The result of querying your Materialized View with a UNION clause should look like:
Results from the materialized view with UNION:
sample_id
0 S1
1 S2
2 S3
3 S4
4 S5
5 S6
6 S7
Source Code for this Tutorial¶
Click to show me
"""Here is where you'll find the code for the MaterializedView tutorial."""
import pandas as pd
from synapseclient import Synapse
from synapseclient.models import Column, ColumnType, MaterializedView, Project, Table
# Initialize Synapse client
syn = Synapse()
syn.login()
# Get the project where we want to create the materialized view
project = Project(name="My uniquely named project about Alzheimer's Disease").get()
project_id = project.id
print(f"Got project with ID: {project_id}")
# Create the first table with some columns and rows
table1_columns = [
Column(name="sample_id", column_type=ColumnType.STRING),
Column(name="patient_id", column_type=ColumnType.STRING),
Column(name="age", column_type=ColumnType.INTEGER),
Column(name="diagnosis", column_type=ColumnType.STRING),
]
table1 = Table(
name="Patient Demographics",
parent_id=project_id,
columns=table1_columns,
)
table1 = table1.store()
print(f"Created table 1 with ID: {table1.id}")
# Add rows to the first table
data1 = pd.DataFrame(
[
{"sample_id": "S1", "patient_id": "P1", "age": 70, "diagnosis": "Alzheimer's"},
{"sample_id": "S2", "patient_id": "P2", "age": 65, "diagnosis": "Healthy"},
{"sample_id": "S3", "patient_id": "P3", "age": 72, "diagnosis": "Alzheimer's"},
{"sample_id": "S4", "patient_id": "P4", "age": 68, "diagnosis": "Healthy"},
{"sample_id": "S5", "patient_id": "P5", "age": 75, "diagnosis": "Alzheimer's"},
{"sample_id": "S6", "patient_id": "P6", "age": 80, "diagnosis": "Healthy"},
]
)
table1.upsert_rows(values=data1, primary_keys=["sample_id"])
# Create the second table with some columns and rows
table2_columns = [
Column(name="sample_id", column_type=ColumnType.STRING),
Column(name="gene", column_type=ColumnType.STRING),
Column(name="expression_level", column_type=ColumnType.DOUBLE),
]
table2 = Table(
name="Gene Expression Data",
parent_id=project_id,
columns=table2_columns,
)
table2 = table2.store()
print(f"Created table 2 with ID: {table2.id}")
# Add rows to the second table
data2 = pd.DataFrame(
[
{"sample_id": "S1", "gene": "APOE", "expression_level": 2.5},
{"sample_id": "S2", "gene": "APP", "expression_level": 1.8},
{"sample_id": "S3", "gene": "PSEN1", "expression_level": 3.2},
{"sample_id": "S4", "gene": "MAPT", "expression_level": 2.1},
{"sample_id": "S5", "gene": "APP", "expression_level": 3.5},
{"sample_id": "S7", "gene": "PSEN2", "expression_level": 1.9},
]
)
table2.upsert_rows(values=data2, primary_keys=["sample_id"])
def create_materialized_view():
"""
Example: Create a new materialized view with a defining SQL query.
"""
materialized_view = MaterializedView(
name="Patient Data View",
description="A view combining patient demographics and gene expression data",
parent_id=project_id,
defining_sql=f"SELECT * FROM {table1.id}",
)
materialized_view = materialized_view.store()
print(f"Created Materialized View with ID: {materialized_view.id}")
materialized_view_id = materialized_view.id
query = f"SELECT * FROM {materialized_view_id}"
query_result: pd.DataFrame = materialized_view.query(
query=query, include_row_id_and_row_version=False
)
# Print the results to the console
print("Results from the materialized view:")
print(query_result)
def create_materialized_view_with_join():
"""
Example: Create a materialized view with a JOIN clause.
"""
defining_sql = f"""
SELECT t1.sample_id AS sample_id, t1.patient_id AS patient_id, t1.age AS age, t1.diagnosis AS diagnosis,
t2.gene AS gene, t2.expression_level AS expression_level
FROM {table1.id} t1
JOIN {table2.id} t2
ON t1.sample_id = t2.sample_id
"""
materialized_view = MaterializedView(
name="Joined Patient Data View",
description="A materialized view joining patient demographics with gene expression data",
parent_id=project_id,
defining_sql=defining_sql,
)
materialized_view = materialized_view.store()
print(f"Created Materialized View with ID: {materialized_view.id}")
materialized_view_id = materialized_view.id
query = f"SELECT * FROM {materialized_view_id}"
query_result: pd.DataFrame = materialized_view.query(
query=query, include_row_id_and_row_version=False
)
# Print the results to the console
print("Results from the materialized view with JOIN:")
print(query_result)
def create_materialized_view_with_left_join():
"""
Example: Create a materialized view with a LEFT JOIN clause.
"""
defining_sql = f"""
SELECT t1.sample_id AS sample_id, t1.patient_id AS patient_id, t1.age AS age, t1.diagnosis AS diagnosis,
t2.gene AS gene, t2.expression_level AS expression_level
FROM {table1.id} t1
LEFT JOIN {table2.id} t2
ON t1.sample_id = t2.sample_id
"""
materialized_view = MaterializedView(
name="Left Joined Patient Data View",
description="A materialized view with a LEFT JOIN clause, including all patients even if they lack gene expression data",
parent_id=project_id,
defining_sql=defining_sql,
)
materialized_view = materialized_view.store()
print(f"Created Materialized View with ID: {materialized_view.id}")
materialized_view_id = materialized_view.id
query = f"SELECT * FROM {materialized_view_id}"
query_result: pd.DataFrame = materialized_view.query(
query=query, include_row_id_and_row_version=False
)
# Print the results to the console
print("Results from the materialized view with LEFT JOIN:")
print(query_result)
def create_materialized_view_with_right_join():
"""
Example: Create a materialized view with a RIGHT JOIN clause.
"""
defining_sql = f"""
SELECT t2.sample_id AS sample_id, t1.patient_id AS patient_id, t1.age AS age, t1.diagnosis AS diagnosis,
t2.gene AS gene, t2.expression_level AS expression_level
FROM {table1.id} t1
RIGHT JOIN {table2.id} t2
ON t1.sample_id = t2.sample_id
"""
materialized_view = MaterializedView(
name="Right Joined Patient Data View",
description="A materialized view with a RIGHT JOIN clause, including all gene expression data even if no patient matches",
parent_id=project_id,
defining_sql=defining_sql,
)
materialized_view = materialized_view.store()
print(f"Created Materialized View with ID: {materialized_view.id}")
materialized_view_id = materialized_view.id
query = f"SELECT * FROM {materialized_view_id}"
query_result: pd.DataFrame = materialized_view.query(
query=query, include_row_id_and_row_version=False
)
# Print the results to the console
print("Results from the materialized view with RIGHT JOIN:")
print(query_result)
def create_materialized_view_with_union():
"""
Example: Create a materialized view with a UNION clause.
"""
defining_sql = f"""
SELECT t1.sample_id AS sample_id
FROM {table1.id} t1
UNION
SELECT t2.sample_id AS sample_id
FROM {table2.id} t2
"""
materialized_view = MaterializedView(
name="Union Patient Data View",
description="A materialized view with a UNION clause",
parent_id=project_id,
defining_sql=defining_sql,
)
materialized_view = materialized_view.store()
print(f"Created Materialized View with ID: {materialized_view.id}")
materialized_view_id = materialized_view.id
query = f"SELECT * FROM {materialized_view_id}"
query_result: pd.DataFrame = materialized_view.query(
query=query, include_row_id_and_row_version=False
)
# Print the results to the console
print("Results from the materialized view with UNION:")
print(query_result)
def main():
create_materialized_view()
create_materialized_view_with_join()
create_materialized_view_with_left_join()
create_materialized_view_with_right_join()
create_materialized_view_with_union()
if __name__ == "__main__":
main()