Skip to content

Commit

Permalink
[Backport release-1.15][python] Set `soma_joinid` to always be an index column in the `PointCloudDataFrame` class (#3575)
Browse files Browse the repository at this point in the history

* [python] Set `soma_joinid` to always be an index column in the `PointCloudDataFrame` class (#3563)

* Add back soma_joinid to PointCloudDataFrame
* Update SOMA spatial encoding version

* lint

---------

Co-authored-by: Julia Dark <[email protected]>
  • Loading branch information
johnkerl and jp-dark authored Jan 16, 2025
1 parent 7987429 commit 20002bb
Show file tree
Hide file tree
Showing 9 changed files with 124 additions and 105 deletions.
165 changes: 87 additions & 78 deletions apis/python/notebooks/tutorial_spatial.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion apis/python/src/tiledbsoma/_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
SOMA_ENCODING_VERSION = "1.1.0"

SOMA_SPATIAL_VERSION_METADATA_KEY = "soma_spatial_encoding_version"
SOMA_SPATIAL_ENCODING_VERSION = "0.1.0"
SOMA_SPATIAL_ENCODING_VERSION = "0.2.0"


SPATIAL_DISCLAIMER = (
Expand Down
2 changes: 1 addition & 1 deletion apis/python/src/tiledbsoma/_multiscale_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ def __init__(
spatial_encoding_version = self.metadata[SOMA_SPATIAL_VERSION_METADATA_KEY]
if isinstance(spatial_encoding_version, bytes):
spatial_encoding_version = str(spatial_encoding_version, "utf-8")
if spatial_encoding_version != "0.1.0":
if spatial_encoding_version not in {"0.1.0", "0.2.0"}:
raise ValueError(
f"Unsupported MultiscaleImage with spatial encoding version "
f"{spatial_encoding_version}"
Expand Down
10 changes: 6 additions & 4 deletions apis/python/src/tiledbsoma/_point_cloud_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from . import pytiledbsoma as clib
from ._constants import (
SOMA_COORDINATE_SPACE_METADATA_KEY,
SOMA_JOINID,
SPATIAL_DISCLAIMER,
)
from ._dataframe import (
Expand Down Expand Up @@ -85,8 +86,8 @@ def create(
``soma_joinid`` column is not provided, one will be added.
The schema of the created point cloud must contain columns for the axes in the
``coordinate_space``. These columns will be index columns for the point cloud
dataframe.
``coordinate_space``. These columns followed by the ``soma_joinid`` will be
index columns for the point cloud dataframe.
Args:
uri: The URI where the dataframe will be created.
Expand Down Expand Up @@ -123,7 +124,6 @@ def create(
axis_dtype: pa.DataType | None = None
if not isinstance(coordinate_space, CoordinateSpace):
coordinate_space = CoordinateSpace.from_axis_names(coordinate_space)
index_column_names = coordinate_space.axis_names
for column_name in coordinate_space.axis_names:
# Check axis column type is valid and all axis columns have the same type.
if axis_dtype is None:
Expand Down Expand Up @@ -151,6 +151,7 @@ def create(
) from ke
if column_dtype != axis_dtype:
raise ValueError("All spatial axes must have the same datatype.")
index_column_names = coordinate_space.axis_names + (SOMA_JOINID,)

context = _validate_soma_tiledb_context(context)
schema = _canonicalize_schema(schema, index_column_names)
Expand Down Expand Up @@ -184,7 +185,8 @@ def create(
nidx = len(index_column_names)
if ndom != nidx:
raise ValueError(
f"if domain is specified, it must have the same length as index_column_names; got {ndom} != {nidx}"
f"if domain is specified, it must have the same length as "
f"index_column_names; got {ndom} != {nidx}"
)

index_column_schema = []
Expand Down
12 changes: 9 additions & 3 deletions apis/python/src/tiledbsoma/io/spatial/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -478,9 +478,9 @@ def from_visium(
obs_df = (
exp.obs.read(column_names=["soma_joinid", "obs_id"]).concat().to_pandas()
)
x_layer = exp.ms[measurement_name].X[X_layer_name]
(len_obs_id, len_var_id) = x_layer.shape
if write_obs_spatial_presence or write_var_spatial_presence:
x_layer = exp.ms[measurement_name].X[X_layer_name]
(len_obs_id, len_var_id) = x_layer.shape
x_layer_data = x_layer.read().tables().concat()
if write_obs_spatial_presence:
obs_id = pacomp.unique(x_layer_data["soma_dim_0"])
Expand Down Expand Up @@ -569,6 +569,7 @@ def from_visium(
pixels_per_spot_diameter,
obs_df,
obs_id_name,
len_obs_id,
**ingest_ctx,
) as loc:
_maybe_set(obsl, "loc", loc, use_relative_uri=use_relative_uri)
Expand Down Expand Up @@ -684,6 +685,7 @@ def _write_visium_spots(
spot_diameter: float,
obs_df: pd.DataFrame,
id_column_name: str,
max_joinid_len: int,
*,
ingestion_params: IngestionParams,
additional_metadata: "AdditionalMetadata" = None,
Expand Down Expand Up @@ -711,7 +713,11 @@ def _write_visium_spots(
df = pd.merge(obs_df, df, how="inner", on=id_column_name)
df.drop(id_column_name, axis=1, inplace=True)

domain = ((df["x"].min(), df["x"].max()), (df["y"].min(), df["y"].max()))
domain = (
(df["x"].min(), df["x"].max()),
(df["y"].min(), df["y"].max()),
(0, max_joinid_len - 1),
)

arrow_table = df_to_arrow(df)

Expand Down
8 changes: 4 additions & 4 deletions apis/python/tests/test_basic_spatialdata_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def experiment_with_single_scene(tmp_path_factory, sample_2d_data) -> soma.Exper
("x_scene1", "y_scene1"), ("x", "y"), 2.0
),
schema=pa.schema([("x", pa.float64()), ("y", pa.float64())]),
domain=[[0, 1], [0, 1]],
domain=[[0, 1], [0, 1], [0, 3]],
)
points1.write(
pa.Table.from_pydict(
Expand All @@ -71,7 +71,7 @@ def experiment_with_single_scene(tmp_path_factory, sample_2d_data) -> soma.Exper
("x_scene1", "y_scene1"), ("x", "y"), 4.0
),
schema=pa.schema([("x", pa.float64()), ("y", pa.float64())]),
domain=[[-1, 0], [-1, 0]],
domain=[[-1, 0], [-1, 0], [0, 3]],
)
points3.write(
pa.Table.from_pydict(
Expand All @@ -91,7 +91,7 @@ def experiment_with_single_scene(tmp_path_factory, sample_2d_data) -> soma.Exper
("x_scene1", "y_scene1"), ("x", "y"), -1.0
),
schema=pa.schema([("x", pa.float64()), ("y", pa.float64())]),
domain=[[-1, 0], [-1, 0]],
domain=[[-1, 0], [-1, 0], [0, 3]],
)
points2.write(
pa.Table.from_pydict(
Expand All @@ -111,7 +111,7 @@ def experiment_with_single_scene(tmp_path_factory, sample_2d_data) -> soma.Exper
("x_scene1", "y_scene1"), ("x", "y"), 0.25
),
schema=pa.schema([("x", pa.float64()), ("y", pa.float64())]),
domain=[[0, 1], [0, 1]],
domain=[[0, 1], [0, 1], [0, 3]],
)
points4.write(
pa.Table.from_pydict(
Expand Down
2 changes: 1 addition & 1 deletion apis/python/tests/test_experiment_query_spatial.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def add_point_cloud_dataframe(
subcoll,
transform=soma.IdentityTransform(("x", "y"), ("x", "y")),
schema=pa.schema([("x", pa.float64()), ("y", pa.float64())]),
domain=[[-1, 1], [-1, 1]],
domain=[[-1, 1], [-1, 1], [0, 63]],
) as points:

if circles:
Expand Down
2 changes: 1 addition & 1 deletion apis/python/tests/test_export_point_cloud_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def sample_point_cloud_dataframe_2d(tmp_path_factory):
with soma.PointCloudDataFrame.create(
uri,
schema=pa.schema([("x", pa.float64()), ("y", pa.float64())]),
domain=[[0, 1], [0, 1]],
domain=[[0, 1], [0, 1], [0, 4]],
) as point_cloud:
x_data = np.array([0, 0, 0.5, 0.5], dtype=np.float64)
y_data = np.array([0, 0.5, 0, 0.5], dtype=np.float64)
Expand Down
26 changes: 14 additions & 12 deletions apis/python/tests/test_point_cloud_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def test_point_cloud_basic_read(tmp_path):
with soma.PointCloudDataFrame.create(
urljoin(baseuri, "default"),
schema=asch,
domain=[[-10000, 10000], [-10000, 10000]],
domain=[[-10000, 10000], [-10000, 10000], [0, 10]],
) as ptc:
pydict = {}
pydict["soma_joinid"] = [1, 2, 3, 4, 5]
Expand All @@ -50,7 +50,7 @@ def test_point_cloud_basic_read(tmp_path):

with soma.PointCloudDataFrame.open(urljoin(baseuri, "default"), "r") as ptc:
assert set(ptc.schema.names) == {"soma_joinid", "x", "y"}
assert ptc.index_column_names == ("x", "y")
assert ptc.index_column_names == ("x", "y", "soma_joinid")
assert ptc.axis_names == ("x", "y")

table = ptc.read().concat()
Expand All @@ -65,7 +65,7 @@ def test_point_cloud_basic_read(tmp_path):
urljoin(baseuri, "user_defined"),
schema=asch,
coordinate_space="x",
domain=((1, 10),),
domain=((1, 10), (0, 10)),
) as ptc:
pydict = {}
pydict["soma_joinid"] = [1, 2, 3, 4, 5]
Expand All @@ -77,9 +77,9 @@ def test_point_cloud_basic_read(tmp_path):

with soma.PointCloudDataFrame.open(urljoin(baseuri, "user_defined"), "r") as ptc:
assert set(ptc.schema.names) == set(["soma_joinid", "x", "y"])
assert ptc.index_column_names == ("x",)
assert ptc.index_column_names == ("x", "soma_joinid")
assert ptc.axis_names == ("x",)
assert ptc.domain == ((1, 10),)
assert ptc.domain == ((1, 10), (0, 10))

table = ptc.read().concat()
assert ptc.count == len(ptc) == table.num_rows == 5
Expand Down Expand Up @@ -143,7 +143,7 @@ def test_point_cloud_bad_read_spatial_region(tmp_path):
schema = pa.schema([("x", pa.float64()), ("y", pa.float64())])

with soma.PointCloudDataFrame.create(
uri, schema=schema, domain=[[-10000, 10000], [-10000, 10000]]
uri, schema=schema, domain=[[-10000, 10000], [-10000, 10000], [0, 10]]
) as ptc:
pydict = {
"soma_joinid": [1, 2, 3, 4, 5],
Expand Down Expand Up @@ -299,7 +299,7 @@ def test_point_cloud_read_spatial_region_basic_2d(
schema = pa.schema([("x", pa.float64()), ("y", pa.float64())])

with soma.PointCloudDataFrame.create(
uri, schema=schema, domain=[[-10000, 10000], [-10000, 10000]]
uri, schema=schema, domain=[[-10000, 10000], [-10000, 10000], [0, 10]]
) as ptc:
pydict = {
"soma_joinid": [1, 2, 3, 4, 5],
Expand Down Expand Up @@ -424,7 +424,7 @@ def test_point_cloud_read_spatial_region_basic_3d(
uri,
schema=schema,
coordinate_space=("x", "y", "z"),
domain=[[-10000, 10000], [-10000, 10000], [-10000, 10000]],
domain=[[-10000, 10000], [-10000, 10000], [-10000, 10000], [0, 10]],
) as ptc:
pydict = {
"soma_joinid": [1, 2, 3, 4, 5],
Expand Down Expand Up @@ -455,7 +455,7 @@ def test_point_cloud_read_spatial_region_2d_bad(tmp_path, name, region, exc_type
schema = pa.schema([("x", pa.float64()), ("y", pa.float64())])

with soma.PointCloudDataFrame.create(
uri, schema=schema, domain=[[-10000, 10000], [-10000, 10000]]
uri, schema=schema, domain=[[-10000, 10000], [-10000, 10000], [0, 10]]
) as ptc:
pydict = {
"soma_joinid": [1, 2, 3, 4, 5],
Expand Down Expand Up @@ -485,7 +485,9 @@ def test_point_cloud_read_spatial_region_3d_bad(tmp_path, name, region, exc_type

schema = pa.schema([("x", pa.float64()), ("y", pa.float64()), ("z", pa.float64())])

with soma.PointCloudDataFrame.create(uri, schema=schema, domain=[[0, 9]]) as ptc:
with soma.PointCloudDataFrame.create(
uri, schema=schema, domain=[[0, 9], [0, 10]]
) as ptc:
pydict = {
"soma_joinid": [1, 2, 3, 4, 5],
"x": [10, 20, 30, 40, 50],
Expand All @@ -504,7 +506,7 @@ def point_cloud_read_spatial_region_transform_setup(uri, transform, input_axes,
schema = pa.schema([("x", pa.float64()), ("y", pa.float64())])

with soma.PointCloudDataFrame.create(
uri, schema=schema, domain=[[-10000, 10000], [-10000, 10000]]
uri, schema=schema, domain=[[-10000, 10000], [-10000, 10000], [0, 10]]
) as ptc:
pydict = {
"soma_joinid": [1, 2, 3, 4, 5],
Expand Down Expand Up @@ -707,7 +709,7 @@ def test_point_cloud_read_spatial_region_region_coord_space(tmp_path):
schema = pa.schema([("x", pa.float64()), ("y", pa.float64())])

with soma.PointCloudDataFrame.create(
uri, schema=schema, domain=[[-10000, 10000], [-10000, 10000]]
uri, schema=schema, domain=[[-10000, 10000], [-10000, 10000], [0, 10]]
) as ptc:
pydict = {
"soma_joinid": [1, 2, 3, 4, 5],
Expand Down

0 comments on commit 20002bb

Please sign in to comment.