# Source code for apybiomart.classes

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Created by Roberto Preste
import io
from typing import Optional, Dict, Any, Tuple, Generator, List, Union
from xml.etree import ElementTree as ET

import asyncio  # noqa
import aiohttp
import requests
import numpy as np
import pandas as pd


class _BiomartException(Exception):
    """Base exception type for BioMart-related errors.

    Adds nothing to the built-in Exception; it exists so that failures
    raised by this module can be told apart from unrelated ones.
    """


class _Server:
    """Base class wrapping HTTP access to a BioMart service.

    Provides one blocking and one coroutine-based way of sending a GET
    request to the configured host.

    Attributes:
        host: URL to connect to
        save: save results to a CSV file [default: False]
        output: output filename if saving results [default: None]
    """

    def __init__(self,
                 host: str = "http://www.ensembl.org/biomart/martservice",
                 save: bool = False,
                 output: Optional[str] = None):
        self.host = host
        self.save = save
        self.output = output
        # Fail early: refuse to build the object when the probe fails.
        reachable = self._check_connection()
        if not reachable:
            raise _BiomartException("No internet connection available!")

    @staticmethod
    def _check_connection() -> bool:
        """Probe a fixed URL to tell whether the network is usable.

        A GET request with a 5 second timeout is sent to
        https://httpstat.us/200; the content of the answer is ignored.

        Returns:
            True when the request completes, False when requests raises
            a RequestException
        """
        try:
            requests.get("https://httpstat.us/200", timeout=5)
        except requests.exceptions.RequestException:
            return False
        return True

    def get_sync(self,
                 **params: Optional[str]):
        """Send a blocking GET request to BioMart.

        Keyword Args:
            params: forwarded as the query-string parameters of the call

        Returns:
            the response object returned by requests.get for self.host
        """
        return requests.get(self.host, params=params)

    async def get_async(self,
                        **params: Optional[str]):
        """Send a GET request to BioMart without blocking the event loop.

        A new aiohttp session is opened for the call and closed again
        before the coroutine returns.

        Keyword Args:
            params: forwarded as the query-string parameters of the call

        Returns:
            the text of the response received from self.host
        """
        async with aiohttp.ClientSession() as session:
            async with session.get(self.host, params=params) as reply:
                body = await reply.text()
                return body


class MartServer(_Server):
    """Class used to retrieve and list available marts."""

    def __init__(self,
                 save: bool = False,
                 output: str = "apybiomart_marts.csv"):
        super().__init__(save=save, output=output)

    def find_marts(self) -> pd.DataFrame:
        """Return the list of available marts as a dataframe.

        Missing values are replaced by empty strings. When self.save is
        true the dataframe is also written to self.output as CSV
        (without the index).

        Returns:
            pd.DataFrame with the columns "Mart_ID" and "Mart_name"
        """
        df = pd.DataFrame.from_records(self._fetch_marts(),
                                       columns=["name", "display_name"])
        df.columns = ["Mart_ID", "Mart_name"]
        df.replace(np.nan, "", inplace=True)
        if self.save:
            df.to_csv(self.output, index=False)

        return df

    def _fetch_marts(self) -> Dict[str, Tuple[Any, ...]]:
        """Retrieve the available marts from BioMart.

        Call BioMart to retrieve the registry and return the internal
        dict that self.find_marts() turns into a dataframe.

        Returns:
            dictionary with the keys "name" and "display_name", each
            holding one tuple with a value per mart (empty tuples when
            the registry lists no mart)
        """
        resp = self.get_sync(type="registry")
        xml = ET.fromstring(resp.content)
        # Transpose the (name, display_name) records into two columns.
        marts = list(zip(*self._mart_from_xml(xml)))
        if not marts:
            # zip(*()) yields nothing, so there is no column to index:
            # hand back empty columns instead of raising IndexError.
            return {"name": (), "display_name": ()}

        return {"name": marts[0],
                "display_name": marts[1]}

    @staticmethod
    def _mart_from_xml(xml) -> Generator[Tuple[str, str], None, None]:
        """Extract mart information from XML.

        Parse the xml to extract name and display name of each
        "MartURLLocation" child of the given element.

        Args:
            xml: ElementTree element retrieved from Biomart

        Yields:
            (name, display name) for each mart node; the display name
            falls back to "" when the attribute is absent, the same
            convention the attribute and filter parsers use for their
            optional fields
        """
        for child in xml.findall("MartURLLocation"):
            yield (child.attrib["name"],
                   child.attrib.get("displayName", ""))


class DatasetServer(_Server):
    """Class used to retrieve and list available datasets for a mart.

    Attributes:
        mart: BioMart mart name
    """

    def __init__(self,
                 mart: str,
                 save: bool = False,
                 output: str = "apybiomart_datasets.csv"):
        super().__init__(save=save, output=output)
        self.mart = mart

    def find_datasets(self) -> pd.DataFrame:
        """Return the list of available datasets for a specific mart as a
        dataframe.

        The tab-separated answer of BioMart is read without a header
        row; only the "name" and "display_name" columns are kept and
        the mart name is added as a third column. Missing values are
        replaced by empty strings. When self.save is true the dataframe
        is also written to self.output as CSV (without the index).

        Returns:
            pd.DataFrame with the columns "Dataset_ID", "Dataset_name"
            and "Mart_ID"
        """
        df = pd.read_csv(self._fetch_datasets(),
                         sep="\t",
                         # TODO: look for proper names in Biomart documentation
                         names=["type", "name", "display_name", "unknown",
                                "version", "unknown3", "unknown4",
                                "virtual_schema", "unknown5"],
                         usecols=["name", "display_name"])
        df["mart"] = self.mart
        df.columns = ["Dataset_ID", "Dataset_name", "Mart_ID"]
        df.replace(np.nan, "", inplace=True)
        if self.save:
            df.to_csv(self.output, index=False)

        return df

    def _fetch_datasets(self) -> io.StringIO:
        """Retrieve available datasets for a mart.

        Call BioMart to retrieve the available datasets for a specific
        mart and return the text buffer that self.find_datasets()
        parses.

        Returns:
            io.StringIO element from response text
        """
        resp = self.get_sync(type="datasets", mart=self.mart)

        return io.StringIO(resp.text)


class AttributesServer(_Server):
    """Class used to retrieve and list available attributes for a dataset.

    Attributes:
        dataset: BioMart dataset name
    """

    def __init__(self,
                 dataset: str,
                 save: bool = False,
                 output: str = "apybiomart_attributes.csv"):
        super().__init__(save=save, output=output)
        self.dataset = dataset

    def find_attributes(self) -> pd.DataFrame:
        """Return the list of available attributes for a specific dataset as
        a dataframe.

        The dataset name is added as a last column and missing values
        are replaced by empty strings. When self.save is true the
        dataframe is also written to self.output as CSV (without the
        index).

        Returns:
            pd.DataFrame with the columns "Attribute_ID",
            "Attribute_name", "Attribute_description" and "Dataset_ID"
        """
        df = pd.DataFrame.from_records(self._fetch_attributes(),
                                       columns=["name",
                                                "display_name",
                                                "description"])
        df["dataset"] = self.dataset
        df.columns = ["Attribute_ID", "Attribute_name",
                      "Attribute_description", "Dataset_ID"]
        df.replace(np.nan, "", inplace=True)
        if self.save:
            df.to_csv(self.output, index=False)

        return df

    def _fetch_attributes(self) -> Dict[str, Tuple[Any, ...]]:
        """Retrieve available attributes for a dataset.

        Call BioMart to retrieve the configuration of a specific
        dataset and return the internal dict that
        self.find_attributes() turns into a dataframe.

        Returns:
            dictionary with the keys "name", "display_name" and
            "description", each holding one tuple with a value per
            attribute (empty tuples when no attribute is found)
        """
        resp = self.get_sync(type="configuration", dataset=self.dataset)
        xml = ET.fromstring(resp.content)
        # Transpose the per-attribute records into three columns.
        attribs = list(zip(*self._attributes_from_xml(xml)))
        if not attribs:
            # zip(*()) yields nothing, so there is no column to index:
            # hand back empty columns instead of raising IndexError.
            return {"name": (), "display_name": (), "description": ()}

        return {"name": attribs[0],
                "display_name": attribs[1],
                "description": attribs[2]}

    @staticmethod
    def _attributes_from_xml(xml) -> Generator[Tuple[str, str, str],
                                               None, None]:
        """Extract attributes information from XML.

        Parse the xml to extract name, display name and description
        of each "AttributeDescription" found below an "AttributePage".

        Args:
            xml: ElementTree element retrieved from Biomart

        Yields:
            (internal name, display name, description) for each
            attribute node; the last two default to "" when absent
        """
        for page in xml.iter("AttributePage"):
            for desc in page.iter("AttributeDescription"):
                attrib = desc.attrib

                yield (attrib["internalName"],
                       attrib.get("displayName", ""),
                       attrib.get("description", ""))


class FiltersServer(_Server):
    """Class used to retrieve and list available filters for a dataset.

    Attributes:
        dataset: BioMart dataset name
    """

    def __init__(self,
                 dataset: str,
                 save: bool = False,
                 output: str = "apybiomart_filters.csv"):
        super().__init__(save=save, output=output)
        self.dataset = dataset

    def find_filters(self) -> pd.DataFrame:
        """Return the list of available filters for a specific dataset as
        a dataframe.

        The dataset name is added as a last column and missing values
        are replaced by empty strings. When self.save is true the
        dataframe is also written to self.output as CSV (without the
        index).

        Returns:
            pd.DataFrame with the columns "Filter_ID", "Filter_type",
            "Filter_description" and "Dataset_ID"
        """
        df = pd.DataFrame.from_records(self._fetch_filters(),
                                       columns=["name",
                                                "type",
                                                "description"])
        df["dataset"] = self.dataset
        df.columns = ["Filter_ID", "Filter_type",
                      "Filter_description", "Dataset_ID"]
        df.replace(np.nan, "", inplace=True)
        if self.save:
            df.to_csv(self.output, index=False)

        return df

    def _fetch_filters(self) -> Dict[str, Tuple[Any, ...]]:
        """Retrieve available filters for a dataset.

        Call BioMart to retrieve the configuration of a specific
        dataset and return the internal dict that self.find_filters()
        turns into a dataframe.

        Returns:
            dictionary with the keys "name", "type" and "description",
            each holding one tuple with a value per filter (empty
            tuples when no filter is found)
        """
        resp = self.get_sync(type="configuration", dataset=self.dataset)
        xml = ET.fromstring(resp.content)
        # Transpose the per-filter records into three columns.
        filters = list(zip(*self._filters_from_xml(xml)))
        if not filters:
            # zip(*()) yields nothing, so there is no column to index:
            # hand back empty columns instead of raising IndexError.
            return {"name": (), "type": (), "description": ()}

        return {"name": filters[0],
                "type": filters[1],
                "description": filters[2]}

    @staticmethod
    def _filters_from_xml(xml) -> Generator[Tuple[str, str, str],
                                            None, None]:
        """Extract filters information from XML.

        Parse the xml to extract name, type and description of each
        "FilterDescription" node.

        Args:
            xml: ElementTree element retrieved from Biomart

        Yields:
            (internal name, type, description) for each filter node;
            the last two default to "" when absent
        """
        for node in xml.iter("FilterDescription"):
            filt = node.attrib
            yield (filt["internalName"],
                   filt.get("type", ""),
                   filt.get("description", ""))


class Query(_Server):
    """Class used to perform either synchronous or asynchronous queries on
    BioMart.

    Attributes:
        attributes: list of attributes to include
        filters: dict of filter name : value to filter results, or None
            to send the query without any filter
        dataset: BioMart dataset name
    """

    def __init__(self,
                 attributes: List[str],
                 filters: Optional[Dict[str,
                                        Union[str, int, list, tuple, bool]]],
                 dataset: str,
                 save: bool = False,
                 output: str = "apybiomart_query.csv"):
        super().__init__(save=save, output=output)
        self.attributes = attributes
        self.filters = filters
        self.dataset = dataset

    def query(self) -> pd.DataFrame:
        """Perform synchronous query.

        Return the result of the query based on the given attributes,
        filters and dataset using get_sync(), as a pandas DataFrame.

        Returns:
            pd.DataFrame parsed from the tab-separated response

        Raises:
            _BiomartException: if the response contains "Query ERROR"
            ValueError: if pandas raises a TypeError while parsing
        """
        resp = self.get_sync(query=self._build_query_xml())

        return self._parse_response(resp.text)

    async def aquery(self) -> pd.DataFrame:
        """Perform asynchronous query.

        Return the result of the query based on the given attributes,
        filters and dataset using get_async(), as a pandas DataFrame.

        Returns:
            pd.DataFrame parsed from the tab-separated response

        Raises:
            _BiomartException: if the response contains "Query ERROR"
            ValueError: if pandas raises a TypeError while parsing
        """
        resp = await self.get_async(query=self._build_query_xml())

        return self._parse_response(resp)

    def _build_query_xml(self) -> str:
        """Serialise the attributes, filters and dataset as a BioMart query.

        Returns:
            the XML document sent as the "query" request parameter

        Raises:
            _BiomartException: if adding an attribute or filter node
                raises a KeyError
        """
        # Setup query element.
        root = ET.Element("Query")
        root.set("virtualSchemaName", "default")
        root.set("formatter", "TSV")
        root.set("header", "1")
        root.set("datasetConfigVersion", "0.6")
        # Add dataset element.
        dataset = ET.SubElement(root, "Dataset")
        dataset.set("name", self.dataset)
        dataset.set("interface", "default")

        # Add attribute elements.
        # NOTE(review): the node builders of this class do not look like
        # they can raise KeyError; the translation below is kept so that
        # callers relying on it (e.g. through an override) are unaffected.
        for name in self.attributes:
            try:
                self._add_attr_node(dataset, name)
            except KeyError as err:
                raise _BiomartException(
                    "Unknown attribute {}, check dataset attributes "
                    "for a list of valid attributes.".format(name)) from err

        if self.filters is not None:
            # Add filter elements.
            for name, value in self.filters.items():
                try:
                    self._add_filter_node(dataset, name, value)
                except KeyError as err:
                    raise _BiomartException(
                        "Unknown filter {}, check dataset filters "
                        "for a list of valid filters.".format(name)) from err

        return str(ET.tostring(root), "utf-8")

    def _parse_response(self, text: str) -> pd.DataFrame:
        """Turn the text returned by BioMart into a dataframe.

        Missing values are replaced by empty strings. When self.save is
        true the dataframe is also written to self.output as CSV
        (without the index).

        Args:
            text: body of the BioMart response

        Returns:
            pd.DataFrame parsed from the tab-separated text

        Raises:
            _BiomartException: if the text contains "Query ERROR"
            ValueError: if pandas raises a TypeError while parsing
        """
        if "Query ERROR" in text:
            raise _BiomartException(text)

        try:
            result = pd.read_csv(io.StringIO(text), sep="\t")
        # Type error is raised if a data type is not understood by pandas
        except TypeError as err:
            raise ValueError(
                "Non valid data type is used in dtypes") from err
        result.replace(np.nan, "", inplace=True)

        if self.save:
            result.to_csv(self.output, index=False)

        return result

    @staticmethod
    def _add_attr_node(root, attr: str):
        """Add the given attribute name to the dataset ElementTree sub-element.

        Args:
            root: dataset sub-element root node
            attr: attribute name
        """
        attr_el = ET.SubElement(root, "Attribute")
        attr_el.set("name", attr)

    @staticmethod
    def _add_filter_node(root,
                         name: str,
                         value: Union[str, int, list, tuple, bool]):
        """Add the given filter name and value to the dataset ElementTree
        sub-element.

        The value decides which XML attribute is written:
        booleans and the strings "included", "only" and "excluded"
        (case-insensitive) set "excluded" to "0" or "1"; lists and
        tuples are joined with commas into "value"; anything else is
        converted with str() and stored in "value".

        Args:
            root: dataset sub-element root node
            name: filter name
            value: filter value
        """
        filter_el = ET.SubElement(root, "Filter")
        filter_el.set("name", name)

        # TODO
        # Set filter value depending on type.
        # Boolean case (tested first: bool is also an int)
        if isinstance(value, bool):
            filter_el.set("excluded", "0" if value else "1")
        # List case
        elif isinstance(value, (list, tuple)):
            filter_el.set("value", ",".join(map(str, value)))
        # String case
        elif isinstance(value, str):
            keyword = value.lower()
            if keyword in ("included", "only"):
                filter_el.set("excluded", "0")
            elif keyword == "excluded":
                filter_el.set("excluded", "1")
            else:
                filter_el.set("value", value)
        # Mostly int case
        else:
            filter_el.set("value", str(value))