[ramdisk] add cvitek pre-built ramdisk
Change-Id: Ic7d2046a23358129eaf621b5558984a64fa7361d
This commit is contained in:
@ -0,0 +1,71 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
|
||||
from tabledata import (
|
||||
SQLiteTableDataSanitizer,
|
||||
TableData,
|
||||
TableDataSanitizer,
|
||||
|
||||
InvalidTableNameError,
|
||||
InvalidHeaderNameError,
|
||||
InvalidDataError,
|
||||
EmptyDataError,
|
||||
)
|
||||
|
||||
from ._constant import PatternMatch
|
||||
from ._logger import (
|
||||
logger,
|
||||
set_logger,
|
||||
set_log_level,
|
||||
)
|
||||
from .csv.core import (
|
||||
CsvTableFileLoader,
|
||||
CsvTableTextLoader,
|
||||
)
|
||||
from .error import (
|
||||
ValidationError,
|
||||
InvalidPathError,
|
||||
InvalidFilePathError,
|
||||
InvalidUrlError,
|
||||
OpenError,
|
||||
LoaderNotFoundError,
|
||||
HTTPError,
|
||||
ProxyError,
|
||||
PypandocImportError,
|
||||
)
|
||||
from .html.core import (
|
||||
HtmlTableFileLoader,
|
||||
HtmlTableTextLoader,
|
||||
)
|
||||
from .json.core import (
|
||||
JsonTableFileLoader,
|
||||
JsonTableTextLoader,
|
||||
)
|
||||
from .loadermanager import (
|
||||
TableFileLoader,
|
||||
TableUrlLoader,
|
||||
)
|
||||
from .ltsv.core import (
|
||||
LtsvTableFileLoader,
|
||||
LtsvTableTextLoader,
|
||||
)
|
||||
from .markdown.core import (
|
||||
MarkdownTableFileLoader,
|
||||
MarkdownTableTextLoader,
|
||||
)
|
||||
from .mediawiki.core import (
|
||||
MediaWikiTableFileLoader,
|
||||
MediaWikiTableTextLoader,
|
||||
)
|
||||
from .spreadsheet.excelloader import ExcelTableFileLoader
|
||||
from .spreadsheet.gsloader import GoogleSheetsTableLoader
|
||||
from .sqlite.core import SqliteFileLoader
|
||||
from .tsv.core import (
|
||||
TsvTableFileLoader,
|
||||
TsvTableTextLoader,
|
||||
)
|
||||
Binary file not shown.
@ -0,0 +1,34 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
|
||||
import abc
|
||||
|
||||
import six
|
||||
|
||||
|
||||
@six.add_metaclass(abc.ABCMeta)
class LoaderAcceptorInterface(object):
    """
    Interface that every table loader acceptor has to implement.
    """

    @abc.abstractmethod
    def accept(self, loader):  # pragma: no cover
        """
        Receive the ``loader`` object this acceptor is going to work with.

        :param loader: Loader handed over to the acceptor.
        """
        pass
|
||||
|
||||
|
||||
class LoaderAcceptor(LoaderAcceptorInterface):
    """
    Base acceptor implementation that remembers the loader it was given.
    """

    def __init__(self):
        # No loader is attached until accept() has been called.
        self._loader = None

    def accept(self, loader):
        """
        Keep a reference to ``loader`` in ``self._loader``.

        :param loader: Loader to be stored.
        """
        self._loader = loader
|
||||
Binary file not shown.
@ -0,0 +1,57 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import os.path
|
||||
import posixpath
|
||||
|
||||
import pathvalidate
|
||||
import typepy
|
||||
|
||||
from six.moves.urllib.parse import urlparse
|
||||
|
||||
from .error import InvalidFilePathError
|
||||
|
||||
|
||||
def get_extension(file_path):
    """
    Return the extension of ``file_path`` with the leading period(s) removed.

    :param str file_path: Path to extract the extension from.
    :return: Extension part of the path as split by ``os.path.splitext``.
    :rtype: str
    :raises InvalidFilePathError:
        If ``typepy.is_null_string`` reports ``file_path`` as null.
    """
    if typepy.is_null_string(file_path):
        raise InvalidFilePathError("file path is empty")

    _, extension = os.path.splitext(file_path)

    return extension.lstrip(".")
|
||||
|
||||
|
||||
def convert_idx_to_alphabet(column_idx):
    """
    Convert a zero-based column index to a spreadsheet style column name.

    ``0`` maps to ``"A"``, ``25`` to ``"Z"``, ``26`` to ``"AA"`` and so on.

    :param int column_idx: Zero-based column index.
    :return: Alphabetical column name.
    :rtype: str
    """
    if column_idx < 26:
        return chr(65 + column_idx)

    # divmod() keeps the arithmetic in integers. The previous
    # ``int(column_idx / 26 - 1)`` went through float division under
    # Python 3, which loses precision for very large indices.
    upper, lower = divmod(column_idx, 26)

    return convert_idx_to_alphabet(upper - 1) + convert_idx_to_alphabet(lower)
|
||||
|
||||
|
||||
def make_temp_file_path_from_url(temp_dir_path, url):
    """
    Build a file path under ``temp_dir_path`` named after the last path
    component of ``url``.

    :param str temp_dir_path: Directory the resulting path is joined onto.
    :param str url: URL whose path component provides the file name.
    :return: ``temp_dir_path`` joined (POSIX style) with the file name.
    :rtype: str
    :raises InvalidFilePathError:
        If ``url`` or ``temp_dir_path`` is not a string, or if no file
        name can be derived from the URL path.
    """
    try:
        parsed_path = urlparse(url).path
    except AttributeError:
        raise InvalidFilePathError("url must be a string")

    if typepy.is_null_string(parsed_path):
        raise InvalidFilePathError(
            "invalid URL path: {}".format(parsed_path))

    file_name = os.path.basename(parsed_path.rstrip("/"))

    if typepy.is_null_string(file_name):
        # NOTE(review): the name is already null here, so replacing symbols
        # in it looks like a no-op -- confirm the intended fallback.
        file_name = pathvalidate.replace_symbol(
            file_name, replacement_text="_")

    if typepy.is_null_string(file_name):
        raise InvalidFilePathError("invalid URL: {}".format(url))

    try:
        temp_file_path = posixpath.join(temp_dir_path, file_name)
    except (TypeError, AttributeError):
        raise InvalidFilePathError("temp_dir_path must be a string")

    return temp_file_path
|
||||
Binary file not shown.
@ -0,0 +1,38 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import enum
|
||||
|
||||
|
||||
class Default(object):
    """
    Default values shared by the loaders.
    """

    # Encoding used when the caller does not specify one.
    ENCODING = "utf-8"
|
||||
|
||||
|
||||
class SourceType(object):
    """
    Names of the supported kinds of data source.
    """

    # Data given directly as a text object.
    TEXT = "text"

    # Data read from a file path.
    FILE = "file"

    # Data fetched from a URL.
    URL = "url"
|
||||
|
||||
|
||||
class TableNameTemplate(object):
    """
    ``%(name)s`` style format specifiers usable in table name templates.
    """

    # Pattern that wraps a key name into a ``%(key)s`` specifier.
    __FORMAT = "%({:s})s"

    DEFAULT = __FORMAT.format("default")
    FILENAME = __FORMAT.format("filename")

    FORMAT_NAME = __FORMAT.format("format_name")
    FORMAT_ID = __FORMAT.format("format_id")
    GLOBAL_ID = __FORMAT.format("global_id")

    KEY = __FORMAT.format("key")
    TITLE = __FORMAT.format("title")
    SHEET = __FORMAT.format("sheet")
|
||||
|
||||
|
||||
@enum.unique
class PatternMatch(enum.Enum):
    """
    Enumeration with two members, ``OR`` and ``AND``.
    """

    OR = 0
    AND = 1
|
||||
Binary file not shown.
@ -0,0 +1,119 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import abc
|
||||
|
||||
import dataproperty
|
||||
import logbook
|
||||
import six
|
||||
|
||||
|
||||
# Module-level logbook logger named "pytablereader"; it is disabled right
# after creation and can be switched on through set_logger()/set_log_level().
logger = logbook.Logger("pytablereader")
logger.disable()
|
||||
|
||||
|
||||
def set_logger(is_enable):
    """
    Enable or disable logging of this module.

    The setting is also forwarded to ``dataproperty`` and, when it can be
    imported, to ``simplesqlite``.

    :param bool is_enable: ``True`` to enable logging, ``False`` to disable.
    """
    # ``logger.disabled`` differing from ``is_enable`` means the logger is
    # already in the requested state, so there is nothing to do.
    if is_enable != logger.disabled:
        return

    switch = logger.enable if is_enable else logger.disable
    switch()

    dataproperty.set_logger(is_enable)

    # simplesqlite is optional: skip it silently when it is not installed.
    try:
        import simplesqlite

        simplesqlite.set_logger(is_enable)
    except ImportError:
        pass
|
||||
|
||||
|
||||
def set_log_level(log_level):
    """
    Set logging level of this module. Using
    `logbook <http://logbook.readthedocs.io/en/stable/>`__ module for logging.

    :param int log_level:
        One of the log level of
        `logbook <http://logbook.readthedocs.io/en/stable/api/base.html>`__.
        Disabled logging if ``log_level`` is ``logbook.NOTSET``.
    :raises LookupError: If ``log_level`` is an invalid value.
    """

    # Looking up the level name doubles as validation of ``log_level``.
    logbook.get_level_name(log_level)

    if log_level == logger.level:
        # Requested level is already in effect.
        return

    # NOTSET turns logging off; every other level turns it on.
    set_logger(is_enable=(log_level != logbook.NOTSET))

    logger.level = log_level
    dataproperty.set_log_level(log_level)

    # simplesqlite is optional: skip it silently when it is not installed.
    try:
        import simplesqlite

        simplesqlite.set_log_level(log_level)
    except ImportError:
        pass
|
||||
|
||||
|
||||
@six.add_metaclass(abc.ABCMeta)
class LoggerInterface(object):
    """
    Interface of the helper classes that log a load operation.
    """

    @abc.abstractmethod
    def logging_load(self):  # pragma: no cover
        """
        Write a log record describing the load that is about to happen.
        """
        pass
|
||||
|
||||
|
||||
class BaseLogger(LoggerInterface):
    """
    Common base of the load loggers; keeps the loader being logged.
    """

    def __init__(self, loader):
        # Loader whose attributes are reported by logging_load().
        self._loader = loader
|
||||
|
||||
|
||||
class FileSourceLogger(BaseLogger):
    """
    Load logger for loaders that read from a path-like source.
    """

    def logging_load(self):
        """
        Emit a debug record with the loader's source type, format name,
        source path and, when the loader has one, its encoding.
        """
        loader = self._loader
        fragments = [
            "loading {:s}: format={:s}, path={}".format(
                loader.source_type, loader.format_name, loader.source),
        ]

        # Not every loader exposes an ``encoding`` attribute.
        try:
            fragments.append(", encoding={}".format(loader.encoding))
        except AttributeError:
            pass

        logger.debug("".join(fragments))
|
||||
|
||||
|
||||
class TextSourceLogger(BaseLogger):
    """
    Load logger for loaders that read from a text object.
    """

    def logging_load(self):
        """
        Emit a debug record with the loader's format name and source type,
        plus the source length and encoding when they are available.
        """
        loader = self._loader
        fragments = [
            "loading {:s} {:s}".format(
                loader.format_name, loader.source_type),
        ]

        # The source may be an object without a length.
        try:
            fragments.append(", len={}".format(len(loader.source)))
        except TypeError:
            pass

        # Not every loader exposes an ``encoding`` attribute.
        try:
            fragments.append(", encoding={}".format(loader.encoding))
        except AttributeError:
            pass

        logger.debug("".join(fragments))
|
||||
Binary file not shown.
@ -0,0 +1,107 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import abc
|
||||
import os.path
|
||||
|
||||
from pytablereader import EmptyDataError
|
||||
import six
|
||||
import typepy
|
||||
|
||||
import pathvalidate as pv
|
||||
from six.moves.urllib.parse import urlparse
|
||||
|
||||
from ._constant import SourceType
|
||||
from .error import (
|
||||
InvalidFilePathError,
|
||||
InvalidUrlError
|
||||
)
|
||||
|
||||
|
||||
@six.add_metaclass(abc.ABCMeta)
class ValidatorInterface(object):
    """
    Interface that data source validators have to implement.
    """

    @abc.abstractproperty
    def source_type(self):
        """
        Kind of data source handled by the validator.
        """
        pass

    @abc.abstractmethod
    def validate(self):
        """
        Check the data source and raise an exception when it is invalid.
        """
        pass
|
||||
|
||||
|
||||
class BaseValidator(ValidatorInterface):
    """
    Abstract base of the data source validators; holds the source.
    """

    def __init__(self, source):
        self.__source = source

    @property
    def source(self):
        """
        Data source passed to the constructor.
        """
        return self.__source
|
||||
|
||||
|
||||
class FileValidator(BaseValidator):
    """
    Validator for data sources given as a file path.
    """

    @property
    def source_type(self):
        return SourceType.FILE

    def validate(self):
        """
        Check that the source is a valid path pointing to an existing file.

        :raises InvalidFilePathError:
            If ``pathvalidate`` rejects the path (empty name, invalid
            character, invalid length or ``ValueError``).
        :raises IOError: If the path does not point to a file.
        """
        file_path = self.source

        try:
            pv.validate_file_path(file_path)
        except pv.NullNameError:
            raise InvalidFilePathError("file path is empty")
        except (ValueError, pv.InvalidCharError, pv.InvalidLengthError) as e:
            raise InvalidFilePathError(e)

        if os.path.isfile(file_path):
            return

        raise IOError("file not found")
|
||||
|
||||
|
||||
class TextValidator(BaseValidator):
    """
    Validator for data sources given as a text object.
    """

    @property
    def source_type(self):
        return SourceType.TEXT

    def validate(self):
        """
        Check that the source text is not empty.

        :raises EmptyDataError:
            If ``typepy.is_null_string`` reports the source as null.
        """
        text_is_null = typepy.is_null_string(self.source)

        if text_is_null:
            raise EmptyDataError("data source is empty")
|
||||
|
||||
|
||||
class UrlValidator(BaseValidator):
    """
    Validator for data sources given as a URL.
    """

    @property
    def source_type(self):
        return SourceType.URL

    def validate(self):
        """
        Check that the source is a non-empty ``http``/``https`` URL.

        :raises InvalidUrlError:
            If the URL is empty or its scheme is neither ``http`` nor
            ``https``.
        """
        url = self.source

        if typepy.is_null_string(url):
            raise InvalidUrlError("url is empty")

        scheme = urlparse(url).scheme
        if scheme in ("http", "https"):
            return

        raise InvalidUrlError(
            "invalid scheme: expected=http/https, actual={}".format(scheme))
|
||||
Binary file not shown.
Binary file not shown.
@ -0,0 +1,244 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import csv
|
||||
import io
|
||||
import platform
|
||||
|
||||
from mbstrdecoder import MultiByteStrDecoder
|
||||
from pytablereader import InvalidDataError
|
||||
import six
|
||||
import typepy
|
||||
|
||||
from .._constant import (
|
||||
Default,
|
||||
TableNameTemplate as tnt,
|
||||
)
|
||||
from .._logger import (
|
||||
FileSourceLogger,
|
||||
TextSourceLogger,
|
||||
)
|
||||
from .._validator import (
|
||||
FileValidator,
|
||||
TextValidator
|
||||
)
|
||||
from ..interface import TableLoader
|
||||
from .formatter import CsvTableFormatter
|
||||
|
||||
|
||||
class CsvTableLoader(TableLoader):
    """
    Abstract base class of the CSV table loaders.

    .. py:attribute:: header_list

        Attribute names of the table. Use the first line of
        the CSV file as attribute list if header_list is empty.

    .. py:attribute:: delimiter

        A one-character string used to separate fields.
        Defaults to ``","``.

    .. py:attribute:: quotechar

        A one-character string used to quote fields containing
        special characters, such as the ``delimiter`` or ``quotechar``,
        or which contain new-line characters.
        Defaults to ``'"'``.

    .. py:attribute:: encoding

        Encoding of the CSV data.
    """

    def __init__(self, source):
        super(CsvTableLoader, self).__init__(source)

        # Created by load() of the concrete loaders.
        self._csv_reader = None

        self.header_list = ()
        self.delimiter = ","
        self.quotechar = '"'
        self.encoding = Default.ENCODING

    @property
    def format_name(self):
        return "csv"

    @property
    def delimiter(self):
        # "delimiter" must be a string, not an unicode
        decoder = MultiByteStrDecoder(self.__delimiter)

        return str(decoder.unicode_str)

    @delimiter.setter
    def delimiter(self, value):
        self.__delimiter = value

    @property
    def quotechar(self):
        # "quotechar" must be a string, not an unicode
        decoder = MultiByteStrDecoder(self.__quotechar)

        return str(decoder.unicode_str)

    @quotechar.setter
    def quotechar(self, value):
        self.__quotechar = value

    def _to_data_matrix(self):
        """
        Read every non-empty row from ``self._csv_reader`` and convert
        its items.

        :return: List of rows, each a list of converted items.
        :rtype: list
        :raises InvalidDataError: If the csv module reports an error.
        """
        data_matrix = []

        try:
            for row in self._csv_reader:
                if not typepy.is_not_empty_sequence(row):
                    continue

                data_matrix.append([
                    self.__modify_item(item) for item in row
                ])
        except csv.Error as e:
            raise InvalidDataError(e)

        return data_matrix

    @staticmethod
    def __modify_item(data):
        """
        Convert ``data`` to an integer if possible, else to a real number,
        else to the unicode string produced by ``MultiByteStrDecoder``.
        """
        for numeric_type in (typepy.type.Integer, typepy.type.RealNumber):
            try:
                return numeric_type(data).convert()
            except typepy.TypeConversionError:
                continue

        return MultiByteStrDecoder(data).unicode_str
|
||||
|
||||
|
||||
class CsvTableFileLoader(CsvTableLoader):
    """
    A file loader class to extract tabular data from CSV files.

    :param str file_path: Path to the loading CSV file.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(filename)s``.

    :Examples:
        :ref:`example-csv-table-loader`
    """

    def __init__(self, file_path):
        super(CsvTableFileLoader, self).__init__(file_path)

        self._validator = FileValidator(file_path)
        self._logger = FileSourceLogger(self)

    def load(self):
        """
        Extract tabular data as |TableData| instances from a CSV file.
        |load_source_desc_file|

        :return:
            Loaded table data.
            |load_table_name_desc|

            =================== ========================================
            Format specifier    Value after the replacement
            =================== ========================================
            ``%(filename)s``    |filename_desc|
            ``%(format_name)s`` ``"csv"``
            ``%(format_id)s``   |format_id_desc|
            ``%(global_id)s``   |global_id|
            =================== ========================================
        :rtype: |TableData| iterator
        :raises pytablereader.InvalidDataError:
            If the CSV data is invalid.

        .. seealso::
            :py:func:`csv.reader`
        """

        self._validate()
        self._logger.logging_load()

        # The explicit encoding is only applied on Windows with Python 3;
        # everywhere else the file is opened with the platform default.
        if all([platform.system() == "Windows", six.PY3]):
            csv_file = io.open(self.source, "r", encoding=self.encoding)
        else:
            csv_file = open(self.source, "r")

        # _to_data_matrix() reads all rows eagerly, so the file can be
        # closed as soon as the matrix is built instead of leaking the
        # handle until garbage collection.
        with csv_file:
            self._csv_reader = csv.reader(
                csv_file,
                delimiter=self.delimiter, quotechar=self.quotechar,
                strict=True, skipinitialspace=True)
            data_matrix = self._to_data_matrix()

        formatter = CsvTableFormatter(data_matrix)
        formatter.accept(self)

        return formatter.to_table_data()

    def _get_default_table_name_template(self):
        return tnt.FILENAME
|
||||
|
||||
|
||||
class CsvTableTextLoader(CsvTableLoader):
    """
    A text loader class to extract tabular data from CSV text data.

    :param str text: CSV text to load.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(format_name)s%(format_id)s``.

    :Examples:
        :ref:`example-csv-table-loader`
    """

    def __init__(self, text):
        super(CsvTableTextLoader, self).__init__(text)

        self._validator = TextValidator(text)
        self._logger = TextSourceLogger(self)

    def load(self):
        """
        Extract tabular data as |TableData| instances from a CSV text object.
        |load_source_desc_text|

        :return:
            Loaded table data.
            |load_table_name_desc|

            =================== ========================================
            Format specifier    Value after the replacement
            =================== ========================================
            ``%(filename)s``    ``""``
            ``%(format_name)s`` ``"csv"``
            ``%(format_id)s``   |format_id_desc|
            ``%(global_id)s``   |global_id|
            =================== ========================================
        :rtype: |TableData| iterator
        :raises pytablereader.InvalidDataError:
            If the CSV data is invalid.

        .. seealso::
            :py:func:`csv.reader`
        """

        self._validate()
        self._logger.logging_load()

        # Surrounding whitespace is stripped before the text is parsed.
        text_stream = six.StringIO(self.source.strip())
        self._csv_reader = csv.reader(
            text_stream,
            delimiter=self.delimiter, quotechar=self.quotechar,
            strict=True, skipinitialspace=True)

        data_matrix = self._to_data_matrix()
        formatter = CsvTableFormatter(data_matrix)
        formatter.accept(self)

        return formatter.to_table_data()

    def _get_default_table_name_template(self):
        return "{:s}{:s}".format(tnt.FORMAT_NAME, tnt.FORMAT_ID)
|
||||
Binary file not shown.
@ -0,0 +1,44 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from pytablereader import InvalidDataError
|
||||
from tabledata import TableData
|
||||
import typepy
|
||||
|
||||
from ..formatter import TableFormatter
|
||||
|
||||
|
||||
class CsvTableFormatter(TableFormatter):
    """
    Formatter that turns a CSV data matrix into a |TableData| instance.
    """

    def to_table_data(self):
        """
        Yield one |TableData| built from the source data.

        When the loader's ``header_list`` is empty, the first row of the
        source data is used as the header and the remaining rows as data;
        otherwise every row is treated as data.

        :raises pytablereader.InvalidDataError:
            If the source data is empty, a header item is empty, or
            there is no data row.
        """
        if typepy.is_empty_sequence(self._loader.header_list):
            # Report empty data with the same exception type as the other
            # invalid-data cases below, not as a bare IndexError.
            try:
                header_list = self._source_data[0]
            except IndexError:
                raise InvalidDataError(
                    "data row must be greater or equal than one")

            if any([
                typepy.is_null_string(header) for header in header_list
            ]):
                # Trailing spaces keep the sentences of the message apart.
                raise InvalidDataError(
                    "the first line includes empty string item. "
                    "all of the items should contain header name. "
                    "actual={}".format(header_list))

            data_matrix = self._source_data[1:]
        else:
            header_list = self._loader.header_list
            data_matrix = self._source_data

        if not data_matrix:
            raise InvalidDataError(
                "data row must be greater or equal than one")

        self._loader.inc_table_count()

        yield TableData(
            self._loader.make_table_name(), header_list, data_matrix,
            quoting_flags=self._loader.quoting_flags)
|
||||
Binary file not shown.
@ -0,0 +1,71 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
class ValidationError(Exception):
    """
    Raised when data does not have the expected format.
    """
|
||||
|
||||
|
||||
class InvalidPathError(Exception):
    """
    Base class of the path related exceptions.
    """
|
||||
|
||||
|
||||
class InvalidFilePathError(InvalidPathError):
    """
    Raised when an invalid file path is used.
    """
|
||||
|
||||
|
||||
class InvalidUrlError(InvalidPathError):
    """
    Raised when an invalid URL is used.
    """
|
||||
|
||||
|
||||
class OpenError(IOError):
    """
    Raised when a file could not be opened.
    """
|
||||
|
||||
|
||||
class LoaderNotFoundError(Exception):
    """
    Raised when no matching loader could be found.
    """
|
||||
|
||||
|
||||
class HTTPError(requests.RequestException):
    """
    Raised when an HTTP error occurred.

    .. seealso::

        http://docs.python-requests.org/en/master/api/#exceptions
    """
|
||||
|
||||
|
||||
class ProxyError(requests.exceptions.ProxyError):
    """
    Raised when a proxy error occurred.

    .. seealso::

        http://docs.python-requests.org/en/master/_modules/requests/exceptions/
    """
|
||||
|
||||
|
||||
class PypandocImportError(ImportError):
    """
    Raised when importing the pypandoc package failed.
    """
|
||||
Binary file not shown.
@ -0,0 +1,10 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
|
||||
from ._file import TableFileLoaderFactory
|
||||
from ._url import TableUrlLoaderFactory
|
||||
Binary file not shown.
@ -0,0 +1,113 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import abc
|
||||
|
||||
import six
|
||||
|
||||
from .._constant import Default
|
||||
from ..error import LoaderNotFoundError
|
||||
|
||||
|
||||
@six.add_metaclass(abc.ABCMeta)
class BaseTableLoaderFactory(object):
    """
    Abstract base of the factories that create a table loader for a source.
    """

    def __init__(self, source, encoding=Default.ENCODING):
        self._source = source

        # A falsy encoding falls back to the default encoding.
        self._encoding = encoding or Default.ENCODING

    @property
    def source(self):
        """
        :return: Data source to load.
        :rtype: str
        """

        return self._source

    @abc.abstractmethod
    def create_from_path(self):  # pragma: no cover
        pass

    @abc.abstractmethod
    def create_from_format_name(self, format_name):  # pragma: no cover
        pass

    @abc.abstractmethod
    def _get_extension_loader_mapping(self):  # pragma: no cover
        pass

    @abc.abstractmethod
    def _get_format_name_loader_mapping(self):  # pragma: no cover
        pass

    def get_format_name_list(self):
        """
        :return: Available format names.
        :rtype: list
        """

        format_name_mapping = self._get_format_name_loader_mapping()

        return sorted(format_name_mapping)

    def get_extension_list(self):
        """
        :return: Available format file extensions.
        :rtype: list
        """

        extension_mapping = self._get_extension_loader_mapping()

        return sorted(extension_mapping)

    def _get_loader_class(self, loader_mapping, format_name):
        """
        Look up the loader class registered for ``format_name``
        (case insensitive) in ``loader_mapping``.

        :raises TypeError: If ``format_name`` has no ``lower`` method.
        :raises LoaderNotFoundError: If the name is not in the mapping.
        """
        try:
            normalized_name = format_name.lower()
        except AttributeError:
            raise TypeError("format name must be a string")

        try:
            return loader_mapping[normalized_name]
        except KeyError:
            raise LoaderNotFoundError(
                "loader not found: format='{}', source='{}'".format(
                    normalized_name, self.source))

    def _create_from_extension(self, extension):
        """
        Create a loader for ``extension`` and apply the factory encoding.

        :raises LoaderNotFoundError:
            If the extension is unknown; the message lists the
            acceptable extensions.
        """
        try:
            loader_class = self._get_loader_class(
                self._get_extension_loader_mapping(), extension)
            loader = loader_class(self.source)
            loader.encoding = self._encoding

            return loader
        except LoaderNotFoundError as e:
            message_lines = [
                "{:s} (unknown extension).".format(e.args[0]),
                "",
                "acceptable extensions are: {}.".format(
                    ", ".join(self.get_extension_list())),
                "actual: '{}'".format(extension),
            ]

            raise LoaderNotFoundError("\n".join(message_lines))

    def _create_from_format_name(self, format_name):
        """
        Create a loader for ``format_name`` and apply the factory encoding.

        :raises LoaderNotFoundError:
            If the format name is unknown; the message lists the
            acceptable format names.
        """
        try:
            loader_class = self._get_loader_class(
                self._get_format_name_loader_mapping(), format_name)
            loader = loader_class(self.source)
            loader.encoding = self._encoding

            return loader
        except LoaderNotFoundError as e:
            message_lines = [
                "{:s} (unknown format name).".format(e.args[0]),
                "acceptable format names are: {}.".format(
                    ", ".join(self.get_format_name_list())),
            ]

            raise LoaderNotFoundError("\n".join(message_lines))
|
||||
Binary file not shown.
@ -0,0 +1,143 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .._common import get_extension
|
||||
from .._logger import logger
|
||||
from ..csv.core import CsvTableFileLoader
|
||||
from ..html.core import HtmlTableFileLoader
|
||||
from ..json.core import JsonTableFileLoader
|
||||
from ..ltsv.core import LtsvTableFileLoader
|
||||
from ..markdown.core import MarkdownTableFileLoader
|
||||
from ..mediawiki.core import MediaWikiTableFileLoader
|
||||
from ..spreadsheet.excelloader import ExcelTableFileLoader
|
||||
from ..sqlite.core import SqliteFileLoader
|
||||
from ..tsv.core import TsvTableFileLoader
|
||||
from ._base import BaseTableLoaderFactory
|
||||
|
||||
|
||||
class TableFileLoaderFactory(BaseTableLoaderFactory):
    """
    :param str file_path: Path to the loading file.
    :raises pytablereader.InvalidFilePathError:
        If the ``file_path`` is an empty path.
    """

    @property
    def file_extension(self):
        """
        :return: File extension of the :py:attr:`.source` (without period).
        :rtype: str
        """

        return get_extension(self.source)

    def create_from_path(self):
        """
        Create a file loader from the file extension to loading file.
        Supported file extensions are as follows:

            ========================== =====================================
            Extension                  Loader
            ========================== =====================================
            ``"csv"``                  :py:class:`~.CsvTableFileLoader`
            ``"xls"``/``"xlsx"``       :py:class:`~.ExcelTableFileLoader`
            ``"htm"``/``"html"``       :py:class:`~.HtmlTableFileLoader`
            ``"json"``                 :py:class:`~.JsonTableFileLoader`
            ``"ltsv"``                 :py:class:`~.LtsvTableFileLoader`
            ``"md"``                   :py:class:`~.MarkdownTableFileLoader`
            ``"sqlite"``/``"sqlite3"`` :py:class:`~.SqliteFileLoader`
            ``"tsv"``                  :py:class:`~.TsvTableFileLoader`
            ========================== =====================================

        :return:
            Loader that coincides with the file extension of the
            :py:attr:`.file_extension`.
        :raises pytablereader.LoaderNotFoundError:
            |LoaderNotFoundError_desc| loading the file.
        """

        extension = self.file_extension

        logger.debug(
            "TableFileLoaderFactory.create_from_path: extension={}".format(
                extension))

        return self._create_from_extension(extension)

    def create_from_format_name(self, format_name):
        """
        Create a file loader from a format name.
        Supported file formats are as follows:

            =============== ======================================
            Format name     Loader
            =============== ======================================
            ``"csv"``       :py:class:`~.CsvTableFileLoader`
            ``"excel"``     :py:class:`~.ExcelTableFileLoader`
            ``"html"``      :py:class:`~.HtmlTableFileLoader`
            ``"json"``      :py:class:`~.JsonTableFileLoader`
            ``"ltsv"``      :py:class:`~.LtsvTableFileLoader`
            ``"markdown"``  :py:class:`~.MarkdownTableFileLoader`
            ``"mediawiki"`` :py:class:`~.MediaWikiTableFileLoader`
            ``"sqlite"``    :py:class:`~.SqliteFileLoader`
            ``"tsv"``       :py:class:`~.TsvTableFileLoader`
            =============== ======================================

        :param str format_name: Format name string (case insensitive).
        :return: Loader that coincides with the ``format_name``:
        :raises pytablereader.LoaderNotFoundError:
            |LoaderNotFoundError_desc| the format.
        """

        # Log the requested format name. The file extension was logged
        # here before, which mislabelled the record and made the lookup
        # depend on the source path.
        logger.debug(
            "TableFileLoaderFactory.create_from_format_name: name={}".format(
                format_name))

        return self._create_from_format_name(format_name)

    @staticmethod
    def _get_common_loader_mapping():
        """
        :return:
            Mappings shared by the extension and the format name lookups.
        :rtype: dict
        """

        return {
            "csv": CsvTableFileLoader,
            "html": HtmlTableFileLoader,
            "json": JsonTableFileLoader,
            "ltsv": LtsvTableFileLoader,
            "sqlite": SqliteFileLoader,
            "tsv": TsvTableFileLoader,
        }

    def _get_extension_loader_mapping(self):
        """
        :return: Mappings of format extension and loader class.
        :rtype: dict
        """

        loader_table = self._get_common_loader_mapping()
        loader_table.update({
            "htm": HtmlTableFileLoader,
            "md": MarkdownTableFileLoader,
            "sqlite3": SqliteFileLoader,
            "xlsx": ExcelTableFileLoader,
            "xls": ExcelTableFileLoader,
        })

        return loader_table

    def _get_format_name_loader_mapping(self):
        """
        :return: Mappings of format name and loader class.
        :rtype: dict
        """

        loader_table = self._get_common_loader_mapping()
        loader_table.update({
            "excel": ExcelTableFileLoader,
            "markdown": MarkdownTableFileLoader,
            "mediawiki": MediaWikiTableFileLoader,
        })

        return loader_table
|
||||
Binary file not shown.
@ -0,0 +1,224 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import requests
|
||||
import typepy
|
||||
|
||||
from six.moves.urllib.parse import urlparse
|
||||
|
||||
from .._common import (
|
||||
get_extension,
|
||||
make_temp_file_path_from_url,
|
||||
)
|
||||
from .._constant import SourceType
|
||||
from .._logger import logger
|
||||
from .._validator import UrlValidator
|
||||
from ..csv.core import CsvTableTextLoader
|
||||
from ..error import (
|
||||
InvalidFilePathError,
|
||||
InvalidUrlError,
|
||||
HTTPError,
|
||||
ProxyError,
|
||||
)
|
||||
from ..html.core import HtmlTableTextLoader
|
||||
from ..json.core import JsonTableTextLoader
|
||||
from ..ltsv.core import LtsvTableTextLoader
|
||||
from ..markdown.core import MarkdownTableTextLoader
|
||||
from ..mediawiki.core import MediaWikiTableTextLoader
|
||||
from ..spreadsheet.excelloader import ExcelTableFileLoader
|
||||
from ..sqlite.core import SqliteFileLoader
|
||||
from ..tsv.core import TsvTableTextLoader
|
||||
from ._base import BaseTableLoaderFactory
|
||||
|
||||
|
||||
class TableUrlLoaderFactory(BaseTableLoaderFactory):
    """
    Loader factory that fetches table data from a URL.

    The data is downloaded with ``requests``. Text formats are kept in
    memory as a string; file formats are written to a file inside a
    temporary directory, which is deleted when the factory is destroyed.

    :param str url: URL to fetch the table data from.
    :param str encoding:
        Encoding of the data. When null, the encoding reported by the
        HTTP response is used.
    :param dict proxies: Proxy settings passed through to ``requests.get``.
    """

    def __init__(self, url, encoding=None, proxies=None):
        super(TableUrlLoaderFactory, self).__init__(None)

        self.__url = url
        self.__proxies = proxies
        self.__temp_dir_path = None

        self._encoding = encoding

        UrlValidator(url).validate()

    def __del__(self):
        """Delete the temporary download directory, if one was created."""

        if typepy.is_null_string(self.__temp_dir_path):
            return

        import shutil

        # The directory still holds the downloaded file, so os.removedirs()
        # (empty directories only, and it also prunes empty parents) could
        # never delete it. Errors are ignored: this runs from a finalizer.
        shutil.rmtree(self.__temp_dir_path, ignore_errors=True)
        self.__temp_dir_path = None

    def create_from_path(self):
        """
        Create a loader chosen by the file extension found in the URL path.
        Supported file extensions are as follows:

            =========================================  =====================================
            Format name                                Loader
            =========================================  =====================================
            ``"csv"``                                  :py:class:`~.CsvTableTextLoader`
            ``"xls"``/``"xlsx"``                       :py:class:`~.ExcelTableFileLoader`
            ``"htm"``/``"html"``/``"asp"``/``"aspx"``  :py:class:`~.HtmlTableTextLoader`
            ``"json"``                                 :py:class:`~.JsonTableTextLoader`
            ``"ltsv"``                                 :py:class:`~.LtsvTableTextLoader`
            ``"md"``                                   :py:class:`~.MarkdownTableTextLoader`
            ``"sqlite"``/``"sqlite3"``                 :py:class:`~.SqliteFileLoader`
            ``"tsv"``                                  :py:class:`~.TsvTableTextLoader`
            =========================================  =====================================

        :return:
            Loader that coincides with the file extension of the URL.
        :raises pytablereader.InvalidUrlError: If unacceptable URL format.
        :raises pytablereader.LoaderNotFoundError:
            |LoaderNotFoundError_desc| loading the URL.
        :raises pytablereader.HTTPError:
            If the server answers with an error status.
        :raises pytablereader.ProxyError:
            If the request fails with a proxy error.
        """

        url_path = urlparse(self.__url).path
        try:
            # A trailing slash would hide the last path component.
            url_extension = get_extension(url_path.rstrip("/"))
        except InvalidFilePathError:
            raise InvalidUrlError("url must include path")

        logger.debug(
            "TableUrlLoaderFactory.create_from_path: extension={}".format(
                url_extension))

        loader_class = self._get_loader_class(
            self._get_extension_loader_mapping(), url_extension)

        try:
            self._fetch_source(loader_class)
        except requests.exceptions.ProxyError as e:
            raise ProxyError(e)

        return self._create_from_extension(url_extension)

    def create_from_format_name(self, format_name):
        """
        Create a loader from a format name.
        Supported file formats are as follows:

            ==========================  ======================================
            Format name                 Loader
            ==========================  ======================================
            ``"csv"``                   :py:class:`~.CsvTableTextLoader`
            ``"excel"``                 :py:class:`~.ExcelTableFileLoader`
            ``"html"``                  :py:class:`~.HtmlTableTextLoader`
            ``"json"``                  :py:class:`~.JsonTableTextLoader`
            ``"ltsv"``                  :py:class:`~.LtsvTableTextLoader`
            ``"markdown"``              :py:class:`~.MarkdownTableTextLoader`
            ``"mediawiki"``             :py:class:`~.MediaWikiTableTextLoader`
            ``"sqlite"``                :py:class:`~.SqliteFileLoader`
            ``"tsv"``                   :py:class:`~.TsvTableTextLoader`
            ==========================  ======================================

        :param str format_name: Format name string (case insensitive).
        :return: Loader that coincides with the ``format_name``.
        :raises pytablereader.LoaderNotFoundError:
            |LoaderNotFoundError_desc| the format.
        :raises TypeError: If ``format_name`` is not a string.
        :raises pytablereader.HTTPError:
            If the server answers with an error status.
        :raises pytablereader.ProxyError:
            If the request fails with a proxy error.
        """

        logger.debug(
            "TableUrlLoaderFactory.create_from_format_name: name={}".format(
                format_name))

        loader_class = self._get_loader_class(
            self._get_format_name_loader_mapping(), format_name)

        try:
            self._fetch_source(loader_class)
        except requests.exceptions.ProxyError as e:
            raise ProxyError(e)

        return self._create_from_format_name(format_name)

    def _fetch_source(self, loader_class):
        """
        Download the URL and store the result in ``self._source``.

        For text loaders ``self._source`` becomes the response text; for
        file loaders the response body is written to a temporary file and
        ``self._source`` becomes the path of that file.

        :param loader_class: Loader class the fetched data is intended for.
        :raises ValueError: If the loader has an unknown source type.
        :raises pytablereader.HTTPError:
            If the server answers with an error status.
        """

        # A throw-away instance is only needed to ask for the source type.
        loader_source_type = loader_class("").source_type

        if loader_source_type not in [SourceType.TEXT, SourceType.FILE]:
            raise ValueError(
                "unknown loader source: type={}".format(loader_source_type))

        # NOTE(review): no timeout is passed, so a stalled server can block
        # this call indefinitely - confirm whether that is acceptable.
        r = requests.get(self.__url, proxies=self.__proxies)

        try:
            r.raise_for_status()
        except requests.HTTPError as e:
            raise HTTPError(e)

        if typepy.is_null_string(self._encoding):
            self._encoding = r.encoding

        logger.debug("\n".join([
            "_fetch_source: ",
            "  source-type={}".format(loader_source_type),
            # .get(): a response without this header must not fail the
            # fetch merely because of a debug message.
            "  content-type={}".format(r.headers.get("Content-Type")),
            "  encoding={}".format(self._encoding),
            "  status-code={}".format(r.status_code),
        ]))

        if loader_source_type == SourceType.TEXT:
            self._source = r.text
        elif loader_source_type == SourceType.FILE:
            self.__temp_dir_path = tempfile.mkdtemp()
            # NOTE(review): the ".xlsx" suffix is applied to every file-type
            # loader, including SQLite - confirm this is intended.
            self._source = "{:s}.xlsx".format(
                make_temp_file_path_from_url(self.__temp_dir_path, self.__url))
            with open(self._source, "wb") as f:
                f.write(r.content)

    def _get_common_loader_mapping(self):
        """
        :return:
            Mappings shared by the extension table and the format-name table.
        :rtype: dict
        """

        return {
            "csv": CsvTableTextLoader,
            "html": HtmlTableTextLoader,
            "json": JsonTableTextLoader,
            "ltsv": LtsvTableTextLoader,
            "sqlite": SqliteFileLoader,
            "tsv": TsvTableTextLoader,
        }

    def _get_extension_loader_mapping(self):
        """
        :return: Mappings of format-extension and loader class.
        :rtype: dict
        """

        loader_table = self._get_common_loader_mapping()
        loader_table.update({
            "asp": HtmlTableTextLoader,
            "aspx": HtmlTableTextLoader,
            "htm": HtmlTableTextLoader,
            "md": MarkdownTableTextLoader,
            "sqlite3": SqliteFileLoader,
            "xls": ExcelTableFileLoader,
            "xlsx": ExcelTableFileLoader,
        })

        return loader_table

    def _get_format_name_loader_mapping(self):
        """
        :return: Mappings of format-name and loader class.
        :rtype: dict
        """

        loader_table = self._get_common_loader_mapping()
        loader_table.update({
            "excel": ExcelTableFileLoader,
            "markdown": MarkdownTableTextLoader,
            "mediawiki": MediaWikiTableTextLoader,
        })

        return loader_table
|
||||
Binary file not shown.
@ -0,0 +1,40 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
|
||||
import abc
|
||||
|
||||
from pytablereader import InvalidDataError
|
||||
import six
|
||||
|
||||
from ._acceptor import LoaderAcceptor
|
||||
|
||||
|
||||
@six.add_metaclass(abc.ABCMeta)
class TableFormatterInterface(object):
    """
    Interface of table data formatters.
    """

    @abc.abstractmethod
    def to_table_data(self):  # pragma: no cover
        """Convert the source data to table data (implemented by subclasses)."""
|
||||
|
||||
|
||||
class TableFormatter(LoaderAcceptor, TableFormatterInterface):
    """
    Abstract base of |TableData| formatters.

    :param source_data: Data to be formatted; must be truthy.
    :raises pytablereader.InvalidDataError: If ``source_data`` is empty.
    """

    def __init__(self, source_data):
        self._source_data = source_data

        # Reject unusable input at construction time.
        self._validate_source_data()

    def _validate_source_data(self):
        """Raise ``InvalidDataError`` when the source data is falsy."""

        if self._source_data:
            return

        raise InvalidDataError("source data is empty")
|
||||
Binary file not shown.
Binary file not shown.
@ -0,0 +1,159 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import io
|
||||
|
||||
from .._constant import (
|
||||
Default,
|
||||
TableNameTemplate as tnt,
|
||||
)
|
||||
from .._logger import (
|
||||
FileSourceLogger,
|
||||
TextSourceLogger,
|
||||
)
|
||||
from .._validator import (
|
||||
FileValidator,
|
||||
TextValidator
|
||||
)
|
||||
from ..interface import TableLoader
|
||||
from .formatter import HtmlTableFormatter
|
||||
|
||||
|
||||
class HtmlTableLoader(TableLoader):
    """
    Common base of the HTML table loaders.
    """

    @property
    def format_name(self):
        """Format name of this loader family: ``"html"``."""

        return "html"

    def _get_default_table_name_template(self):
        """Return the default table name template: title and key joined by ``_``."""

        template = "{:s}_{:s}"

        return template.format(tnt.TITLE, tnt.KEY)
|
||||
|
||||
|
||||
class HtmlTableFileLoader(HtmlTableLoader):
    """
    Loader that extracts tabular data from an HTML file.

    :param str file_path: Path to the loading HTML file.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(title)s_%(key)s``.

    .. py:attribute:: encoding

        HTML file encoding. Defaults to ``"utf-8"``.
    """

    def __init__(self, file_path=None):
        super(HtmlTableFileLoader, self).__init__(file_path)

        self.encoding = Default.ENCODING

        self._validator = FileValidator(file_path)
        self._logger = FileSourceLogger(self)

    def load(self):
        """
        Extract tabular data as |TableData| instances from the HTML table
        tags of an HTML file.
        |load_source_desc_file|

        :return:
            Loaded table data iterator.
            |load_table_name_desc|

            ===================  ==============================================
            Format specifier     Value after the replacement
            ===================  ==============================================
            ``%(filename)s``     |filename_desc|
            ``%(title)s``        ``<title>`` tag value of the HTML.
            ``%(key)s``          | This is replaced with:
                                 | **(1)** ``id`` attribute of the table tag
                                 | **(2)** ``%(format_name)s%(format_id)s``
                                 | if the ``id`` attribute is not present in
                                 | the table tag.
            ``%(format_name)s``  ``"html"``
            ``%(format_id)s``    |format_id_desc|
            ``%(global_id)s``    |global_id|
            ===================  ==============================================
        :rtype: |TableData| iterator
        :raises pytablereader.InvalidDataError:
            If the HTML data is invalid or empty.

        .. note::

            Table tag attributes are ignored in the loaded |TableData|.
        """

        self._validate()
        self._logger.logging_load()

        # Read the whole document first; parsing does not need the handle.
        with io.open(self.source, "r", encoding=self.encoding) as html_file:
            html_text = html_file.read()

        html_formatter = HtmlTableFormatter(html_text)
        html_formatter.accept(self)

        return html_formatter.to_table_data()
|
||||
|
||||
|
||||
class HtmlTableTextLoader(HtmlTableLoader):
    """
    Loader that extracts tabular data from HTML text.

    :param str text: HTML text to load.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(title)s_%(key)s``.
    """

    def __init__(self, text):
        super(HtmlTableTextLoader, self).__init__(text)

        self._validator = TextValidator(text)
        self._logger = TextSourceLogger(self)

    def load(self):
        """
        Extract tabular data as |TableData| instances from the HTML table
        tags of an HTML text object.
        |load_source_desc_text|

        :return:
            Loaded table data iterator.
            |load_table_name_desc|

            ===================  ==============================================
            Format specifier     Value after the replacement
            ===================  ==============================================
            ``%(filename)s``     ``""``
            ``%(title)s``        ``<title>`` tag value of the HTML.
            ``%(key)s``          | This is replaced with:
                                 | **(1)** ``id`` attribute of the table tag
                                 | **(2)** ``%(format_name)s%(format_id)s``
                                 | if the ``id`` attribute is not included
                                 | in the table tag.
            ``%(format_name)s``  ``"html"``
            ``%(format_id)s``    |format_id_desc|
            ``%(global_id)s``    |global_id|
            ===================  ==============================================
        :rtype: |TableData| iterator
        :raises pytablereader.InvalidDataError:
            If the HTML data is invalid or empty.
        """

        self._validate()
        self._logger.logging_load()

        html_formatter = HtmlTableFormatter(self.source)
        html_formatter.accept(self)

        return html_formatter.to_table_data()
|
||||
Binary file not shown.
@ -0,0 +1,115 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import re
|
||||
|
||||
import bs4
|
||||
from pytablereader import InvalidDataError
|
||||
from tabledata import TableData
|
||||
import typepy
|
||||
|
||||
from .._constant import TableNameTemplate as tnt
|
||||
from ..formatter import TableFormatter
|
||||
|
||||
|
||||
class HtmlTableFormatter(TableFormatter):
    """
    Formatter that converts the ``<table>`` tags of an HTML document to
    |TableData| instances.

    :param str source_data: HTML text to parse.
    :raises pytablereader.InvalidDataError: If ``source_data`` is empty.
    """

    @property
    def table_id(self):
        """``id`` attribute (or caption text) of the most recently parsed table."""

        return self.__table_id

    def __init__(self, source_data):
        super(HtmlTableFormatter, self).__init__(source_data)

        self.__table_id = None

        # The base class only rejects falsy data; this also rejects null
        # strings such as whitespace-only text.
        if typepy.is_null_string(source_data):
            raise InvalidDataError("HTML source data is empty")

        try:
            self.__soup = bs4.BeautifulSoup(self._source_data, "lxml")
        except bs4.FeatureNotFound:
            # lxml is not installed: fall back to the standard library parser.
            self.__soup = bs4.BeautifulSoup(self._source_data, "html.parser")

    def to_table_data(self):
        """
        :return: Iterator over the non-empty tables of the document.
        :rtype: |TableData| iterator
        """

        for table in self.__soup.find_all("table"):
            try:
                table_data = self.__parse_html(table)
            except ValueError:
                # Tables without any data row are skipped on purpose.
                continue

            if table_data.is_empty_record():
                continue

            yield table_data

    def _make_table_name(self):
        """Expand the loader's table name template for the current table."""

        from collections import OrderedDict

        key = self.table_id
        if typepy.is_null_string(key):
            key = self._loader.get_format_key()

        try:
            title = self.__soup.title.text
        except AttributeError:
            # The document has no <title> tag.
            title = ""

        kv_mapping = self._loader._get_basic_tablename_keyvalue_mapping()
        kv_mapping.update(OrderedDict([
            (tnt.KEY, key),
            (tnt.TITLE, title),
        ]))

        return self._loader._expand_table_name_format(kv_mapping)

    def __parse_tag_id(self, table):
        """Store the table's ``id`` attribute, falling back to its caption text."""

        self.__table_id = table.get("id")

        if self.__table_id is None:
            caption = table.find("caption")
            if caption is not None:
                caption = caption.text.strip()
                if typepy.is_not_null_string(caption):
                    self.__table_id = caption

    def __parse_html(self, table):
        """
        Convert one ``<table>`` tag to a |TableData|.

        :param table: ``<table>`` tag to convert.
        :raises ValueError: If the table has no data row.
        """

        header_list = []
        data_matrix = []

        self.__parse_tag_id(table)

        # Anchored so that only real <td>/<th> cells match: bs4 applies a
        # regular expression with search(), so an unanchored "td|th" would
        # also match tag names that merely contain those letters (<thead>).
        re_table_val = re.compile("^(?:td|th)$")

        for row in table.find_all("tr"):
            td_list = row.find_all("td")
            if typepy.is_empty_sequence(td_list):
                # A row without <td> is a header candidate; only the first
                # such row is used.
                if typepy.is_not_empty_sequence(header_list):
                    continue

                th_list = row.find_all("th")
                if typepy.is_empty_sequence(th_list):
                    continue

                # A distinct name keeps the loop variable ``row`` intact
                # (list comprehension variables leak on Python 2).
                header_list = [th.text.strip() for th in th_list]
                continue

            data_matrix.append([
                value.get_text().strip()
                for value in row.find_all(re_table_val)
            ])

        if typepy.is_empty_sequence(data_matrix):
            raise ValueError("data matrix is empty")

        self._loader.inc_table_count()

        return TableData(
            self._make_table_name(), header_list, data_matrix,
            quoting_flags=self._loader.quoting_flags)
|
||||
Binary file not shown.
@ -0,0 +1,158 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import abc
|
||||
import threading
|
||||
|
||||
import path
|
||||
from pytablereader import InvalidTableNameError
|
||||
import six
|
||||
import typepy
|
||||
|
||||
from ._constant import (
|
||||
SourceType,
|
||||
TableNameTemplate as tnt
|
||||
)
|
||||
|
||||
|
||||
@six.add_metaclass(abc.ABCMeta)
class TableLoaderInterface(object):
    """
    Interface that every table loader class has to implement.
    """

    @abc.abstractproperty
    def format_name(self):  # pragma: no cover
        """Name of the format the loader handles."""

    @abc.abstractproperty
    def source_type(self):  # pragma: no cover
        """Kind of source the loader reads from."""

    @abc.abstractmethod
    def load(self):  # pragma: no cover
        """Load the table data from the source."""

    @abc.abstractmethod
    def inc_table_count(self):  # pragma: no cover
        """Count one more loaded table."""
|
||||
|
||||
|
||||
class TableLoader(TableLoaderInterface):
    """
    The abstract class of table data file loader.

    .. py:attribute:: table_name

        Table name string.

    .. py:attribute:: source

        Table data source to load.
    """

    # Table counters shared by all loaders, guarded by the lock. They are
    # always accessed through ``TableLoader`` itself so that neither an
    # instance nor a subclass ends up with a private shadow copy.
    __table_count_lock = threading.Lock()
    __global_table_count = 0
    __format_table_count = {}

    @property
    def source_type(self):
        """Source type reported by the loader's validator."""

        return self._validator.source_type

    def __init__(self, source):
        self.table_name = tnt.DEFAULT
        self.source = source
        self.quoting_flags = None
        # Concrete loaders are expected to replace both of these.
        self._validator = None
        self._logger = None

    def get_format_key(self):
        """Return the format name followed by the per-format table count."""

        return "{:s}{:d}".format(
            self.format_name,
            self.__get_format_table_count())

    def make_table_name(self):
        """Return the table name with all template specifiers expanded."""

        return self._make_table_name()

    def inc_table_count(self):
        """Increment the global and the per-format table counters."""

        with TableLoader.__table_count_lock:
            # ``self.__global_table_count += 1`` would read the class value
            # and store the result on the instance, making the "global"
            # counter per-loader; update the class attribute explicitly.
            TableLoader.__global_table_count += 1
            TableLoader.__format_table_count[self.format_name] = (
                self.__get_format_table_count() + 1)

    @abc.abstractmethod
    def _get_default_table_name_template(self):  # pragma: no cover
        pass

    def _validate(self):
        """Validate the table name and then the source."""

        self._validate_table_name()
        self._validate_source()

    def _validate_table_name(self):
        """
        :raises ValueError: If the table name is a null string.
        :raises TypeError: If the table name is not a string.
        """

        try:
            if typepy.is_null_string(self.table_name):
                raise ValueError("table name is empty")
        except (TypeError, AttributeError):
            raise TypeError("table_name must be a string")

    def _validate_source(self):
        self._validator.validate()

    def __get_format_table_count(self):
        return TableLoader.__format_table_count.get(self.format_name, 0)

    def _get_filename_tablename_mapping(self):
        """
        :return:
            Pair of the filename specifier and its replacement; the
            replacement is empty unless the source is a file path.
        :rtype: tuple
        """

        filename = ""
        if all([
            self.source_type == SourceType.FILE,
            typepy.is_not_null_string(self.source),
        ]):
            filename = path.Path(self.source).namebase

        return (tnt.FILENAME, filename)

    def _get_basic_tablename_keyvalue_mapping(self):
        """
        :return:
            Ordered mapping of template specifiers to replacement values.
            The default template comes first so that the specifiers it
            contains are expanded by the later entries.
        :rtype: collections.OrderedDict
        """

        from collections import OrderedDict

        return OrderedDict([
            (tnt.DEFAULT, self._get_default_table_name_template()),
            (tnt.FORMAT_NAME, self.format_name),
            (tnt.FORMAT_ID, str(self.__get_format_table_count())),
            (tnt.GLOBAL_ID, str(TableLoader.__global_table_count)),
            self._get_filename_tablename_mapping(),
        ])

    def _expand_table_name_format(self, table_name_kv_mapping):
        """
        Replace every template specifier in ``table_name``.

        :param table_name_kv_mapping:
            Mapping of specifiers to values, applied in iteration order.
        :return: Expanded and sanitized table name.
        :rtype: str
        """

        self._validate_table_name()

        table_name = self.table_name
        for template, value in six.iteritems(table_name_kv_mapping):
            table_name = table_name.replace(template, value)

        return self._sanitize_table_name(table_name)

    def _make_table_name(self):
        self._validate_table_name()

        return self._expand_table_name_format(
            self._get_basic_tablename_keyvalue_mapping())

    @staticmethod
    def _sanitize_table_name(table_name):
        """
        Strip leading/trailing underscores from the table name.

        :raises pytablereader.InvalidTableNameError:
            If the name is a null string.
        """

        if typepy.is_null_string(table_name):
            raise InvalidTableNameError(
                "table name is empty after the template replacement")

        return table_name.strip("_")

    @classmethod
    def clear_table_count(cls):
        """Reset the global and the per-format table counters."""

        # Reset the shared counters on TableLoader even when called through
        # a subclass; assigning via ``cls`` would only shadow them there.
        with TableLoader.__table_count_lock:
            TableLoader.__global_table_count = 0
            TableLoader.__format_table_count = {}
|
||||
Binary file not shown.
Binary file not shown.
@ -0,0 +1,301 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import io
|
||||
import json
|
||||
|
||||
from .._constant import (
|
||||
Default,
|
||||
SourceType,
|
||||
TableNameTemplate as tnt
|
||||
)
|
||||
from .._logger import (
|
||||
FileSourceLogger,
|
||||
TextSourceLogger,
|
||||
)
|
||||
from .._validator import (
|
||||
FileValidator,
|
||||
TextValidator
|
||||
)
|
||||
from ..error import ValidationError
|
||||
from ..interface import TableLoader
|
||||
from .formatter import JsonTableFormatter
|
||||
|
||||
|
||||
class JsonTableLoader(TableLoader):
    """
    Common base of the JSON table loaders.
    """

    @property
    def format_name(self):
        """Format name of this loader family: ``"json"``."""

        return "json"
|
||||
|
||||
|
||||
class JsonTableFileLoader(JsonTableLoader):
    """
    Loader that extracts tabular data from a JSON file.

    :param str file_path: Path to the loading JSON file.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(filename)s_%(key)s``.
    """

    def __init__(self, file_path=None):
        super(JsonTableFileLoader, self).__init__(file_path)

        self.encoding = Default.ENCODING

        self._validator = FileValidator(file_path)
        self._logger = FileSourceLogger(self)

    def load(self):
        """
        Extract tabular data as |TableData| instances from a JSON file.
        |load_source_desc_file|

        This method can load four types of JSON formats:

        **(1)** Single table data in a file,
        acceptable JSON Schema is as follows:

        .. code-block:: json
            :caption: JSON Schema (1): accept single table

            {
                "type": "array",
                "items": {
                    "type": "object",
                    "additionalProperties": {
                        "anyOf": [
                            {"type": "string"},
                            {"type": "number"},
                            {"type": "null"}
                        ]
                    }
                }
            }

        .. code-block:: json
            :caption: JSON example for the JSON schema (1)

            [
                {"attr_b": 4, "attr_c": "a", "attr_a": 1},
                {"attr_b": 2.1, "attr_c": "bb", "attr_a": 2},
                {"attr_b": 120.9, "attr_c": "ccc", "attr_a": 3}
            ]

        **(2)** Single table data in a file,
        acceptable JSON Schema is as follows:

        .. code-block:: json
            :caption: JSON Schema (2): accept single table

            {
                "type": "object",
                "additionalProperties": {
                    "anyOf": [
                        {"type": "string"},
                        {"type": "number"},
                        {"type": "null"}
                    ]
                }
            }

        .. code-block:: json
            :caption: JSON example for the JSON schema (2)

            {
                "attr_a": [1, 2, 3],
                "attr_b": [4, 2.1, 120.9],
                "attr_c": ["a", "bb", "ccc"]
            }

        **(3)** Multiple table data in a file,
        acceptable JSON Schema is as follows:

        .. code-block:: json
            :caption: JSON Schema (3): accept multiple table

            {
                "type": "object",
                "additionalProperties": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "additionalProperties": {
                            "anyOf": [
                                {"type": "string"},
                                {"type": "number"},
                                {"type": "null"}
                            ]
                        }
                    }
                }
            }

        .. code-block:: json
            :caption: JSON example for the JSON schema (3)

            {
                "table_a" : [
                    {"attr_b": 4, "attr_c": "a", "attr_a": 1},
                    {"attr_b": 2.1, "attr_c": "bb", "attr_a": 2},
                    {"attr_b": 120.9, "attr_c": "ccc", "attr_a": 3}
                ],
                "table_b" : [
                    {"a": 1, "b": 4},
                    {"a": 2 },
                    {"a": 3, "b": 120.9}
                ]
            }

        **(4)** Multiple table data in a file,
        acceptable JSON Schema is as follows:

        .. code-block:: json
            :caption: JSON Schema (4): accept multiple table

            {
                "type": "object",
                "additionalProperties": {
                    "type": "object",
                    "additionalProperties": {
                        "type": "array",
                        "items": {
                            "anyOf": [
                                {"type": "string"},
                                {"type": "number"},
                                {"type": "null"}
                            ]
                        }
                    }
                }
            }

        .. code-block:: json
            :caption: JSON example for the JSON schema (4)

            {
                "table_a" : {
                    "attr_a": [1, 2, 3],
                    "attr_b": [4, 2.1, 120.9],
                    "attr_c": ["a", "bb", "ccc"]
                },
                "table_b" : {
                    "a": [1, 3],
                    "b": [4, 120.9]
                }
            }

        :return:
            Loaded table data iterator.
            |load_table_name_desc|

            ===================  ==============================================
            Format specifier     Value after the replacement
            ===================  ==============================================
            ``%(filename)s``     |filename_desc|
            ``%(key)s``          | This is replaced with a different value
                                 | for single/multiple JSON tables:
                                 | [single JSON table]
                                 | ``%(format_name)s%(format_id)s``
                                 | [multiple JSON table] Table data key.
            ``%(format_name)s``  ``"json"``
            ``%(format_id)s``    |format_id_desc|
            ``%(global_id)s``    |global_id|
            ===================  ==============================================
        :rtype: |TableData| iterator
        :raises pytablereader.error.ValidationError:
            If the file content cannot be decoded as JSON, or if the data
            is not an acceptable JSON format.
        """

        self._validate()
        self._logger.logging_load()

        with io.open(self.source, "r", encoding=self.encoding) as json_file:
            try:
                # Reading stays inside the try block: decoding errors are
                # ValueError subclasses and get wrapped as well.
                parsed_json = json.loads(json_file.read())
            except ValueError as error:
                raise ValidationError(error)

        json_formatter = JsonTableFormatter(parsed_json)
        json_formatter.accept(self)

        return json_formatter.to_table_data()

    def _get_default_table_name_template(self):
        """Return the default table name template: filename and key joined by ``_``."""

        template = "{:s}_{:s}"

        return template.format(tnt.FILENAME, tnt.KEY)
|
||||
|
||||
|
||||
class JsonTableTextLoader(JsonTableLoader):
    """
    Loader that extracts tabular data from JSON text.

    :param str text: JSON text to load.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(key)s``.
    """

    @property
    def source_type(self):
        """Always ``SourceType.TEXT``, independent of the validator."""

        return SourceType.TEXT

    def __init__(self, text):
        super(JsonTableTextLoader, self).__init__(text)

        self._validator = TextValidator(text)
        self._logger = TextSourceLogger(self)

    def load(self):
        """
        Extract tabular data as |TableData| instances from a JSON text object.
        |load_source_desc_text|

        :return:
            Loaded table data iterator.
            |load_table_name_desc|

            ===================  ==============================================
            Format specifier     Value after the replacement
            ===================  ==============================================
            ``%(filename)s``     ``""``
            ``%(key)s``          | This is replaced with a different value
                                 | for single/multiple JSON tables:
                                 | [single JSON table]
                                 | ``%(format_name)s%(format_id)s``
                                 | [multiple JSON table] Table data key.
            ``%(format_name)s``  ``"json"``
            ``%(format_id)s``    |format_id_desc|
            ``%(global_id)s``    |global_id|
            ===================  ==============================================
        :rtype: |TableData| iterator
        :raises ValueError:
            If the text is not valid JSON (raised by ``json.loads`` and
            not wrapped here).

        .. seealso::

            :py:meth:`.JsonTableFileLoader.load()`
        """

        self._validate()
        self._logger.logging_load()

        parsed_json = json.loads(self.source)

        json_formatter = JsonTableFormatter(parsed_json)
        json_formatter.accept(self)

        return json_formatter.to_table_data()

    def _get_default_table_name_template(self):
        """Return the default table name template: the key specifier alone."""

        template = "{:s}"

        return template.format(tnt.KEY)
|
||||
Binary file not shown.
@ -0,0 +1,260 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import abc
|
||||
|
||||
import jsonschema
|
||||
import six
|
||||
from tabledata import TableData
|
||||
|
||||
from six.moves import zip
|
||||
|
||||
from .._constant import (
|
||||
SourceType,
|
||||
TableNameTemplate as tnt,
|
||||
)
|
||||
from ..error import ValidationError
|
||||
from ..formatter import TableFormatter
|
||||
|
||||
|
||||
class JsonConverter(TableFormatter):
    """
    Abstract base of the JSON data converters.

    :param json_buffer: Decoded JSON data to convert.
    """

    # Schema of a single cell value: string, number or null.
    _VALUE_TYPE_SCHEMA = {
        "anyOf": [
            {"type": "string"},
            {"type": "number"},
            {"type": "null"},
        ],
    }

    def __init__(self, json_buffer):
        # The base class initializer is not called here; validation is
        # done later by _validate_source_data().
        self._buffer = json_buffer

    @abc.abstractproperty
    def _schema(self):  # pragma: no cover
        """JSON Schema the buffer has to satisfy (provided by subclasses)."""

    def _validate_source_data(self):
        """
        Check the buffer against the converter's JSON Schema.

        :raises ValidationError: If the buffer does not match the schema.
        """

        try:
            jsonschema.validate(self._buffer, self._schema)
        except jsonschema.ValidationError as error:
            raise ValidationError(error)
|
||||
|
||||
|
||||
class SingleJsonTableConverterBase(JsonConverter):
    """
    Base of the converters for JSON data that holds a single table.
    """

    def _make_table_name(self):
        """Expand the loader's table name template for a single JSON table."""

        loader = self._loader

        name_mapping = loader._get_basic_tablename_keyvalue_mapping()
        name_mapping[tnt.KEY] = loader.get_format_key()

        # The default template depends on where the data came from.
        if loader.source_type == SourceType.FILE:
            name_mapping[tnt.DEFAULT] = tnt.FILENAME
        elif loader.source_type == SourceType.TEXT:
            name_mapping[tnt.DEFAULT] = tnt.KEY

        return loader._expand_table_name_format(name_mapping)
|
||||
|
||||
|
||||
class SingleJsonTableConverterA(SingleJsonTableConverterBase):
    """
    Converter for a single table given as an array of record objects.
    """

    @property
    def _schema(self):
        record_schema = {
            "type": "object",
            "additionalProperties": self._VALUE_TYPE_SCHEMA,
        }

        return {
            "type": "array",
            "items": record_schema,
        }

    def to_table_data(self):
        """
        :raises ValueError:
        :raises pytablereader.error.ValidationError:
        """

        self._validate_source_data()

        # The headers are the union of the keys of all records.
        header_names = set()
        for record in self._buffer:
            header_names.update(six.viewkeys(record))

        self._loader.inc_table_count()

        yield TableData(
            table_name=self._make_table_name(),
            header_list=sorted(header_names),
            record_list=self._buffer,
            quoting_flags=self._loader.quoting_flags)
|
||||
|
||||
|
||||
class SingleJsonTableConverterB(SingleJsonTableConverterBase):
    """
    Converter for a single table given as an object of column arrays.
    """

    @property
    def _schema(self):
        column_schema = {
            "type": "array",
            "items": self._VALUE_TYPE_SCHEMA,
        }

        return {
            "type": "object",
            "additionalProperties": column_schema,
        }

    def to_table_data(self):
        """
        :raises ValueError:
        :raises pytablereader.error.ValidationError:
        """

        self._validate_source_data()
        self._loader.inc_table_count()

        header_list = sorted(six.viewkeys(self._buffer))

        # Transpose the columns into rows.
        columns = [self._buffer.get(header) for header in header_list]

        yield TableData(
            table_name=self._make_table_name(),
            header_list=header_list,
            record_list=zip(*columns),
            quoting_flags=self._loader.quoting_flags)
|
||||
|
||||
|
||||
class MultipleJsonTableConverterBase(JsonConverter):
    """
    Base of the converters for JSON data that holds several tables.

    :param json_buffer: Decoded JSON data to convert.
    """

    def __init__(self, json_buffer):
        super(MultipleJsonTableConverterBase, self).__init__(json_buffer)

        # Key of the table currently being converted.
        self._table_key = None

    def _make_table_name(self):
        """Expand the loader's table name template using the current table key."""

        loader = self._loader

        name_mapping = loader._get_basic_tablename_keyvalue_mapping()
        name_mapping[tnt.DEFAULT] = tnt.KEY
        name_mapping[tnt.KEY] = self._table_key

        return loader._expand_table_name_format(name_mapping)
|
||||
|
||||
|
||||
class MultipleJsonTableConverterA(MultipleJsonTableConverterBase):
    """
    JSON converter for several tables, each expressed as an array of
    objects stored under a table key.
    """

    @property
    def _schema(self):
        # A record is an object whose values match the value-type schema.
        record_schema = {
            "type": "object",
            "additionalProperties": self._VALUE_TYPE_SCHEMA,
        }

        # A table is an array of records.
        table_schema = {
            "type": "array",
            "items": record_schema,
        }

        return {
            "type": "object",
            "additionalProperties": table_schema,
        }

    def to_table_data(self):
        """
        Yield one |TableData| per table key found in the buffered JSON.

        :raises ValueError:
        :raises pytablereader.error.ValidationError:
        """

        self._validate_source_data()

        for table_key, record_list in six.iteritems(self._buffer):
            # The header is the union of the keys that appear in any record.
            header_name_set = set()
            for record in record_list:
                header_name_set.update(six.viewkeys(record))

            self._loader.inc_table_count()
            # Must be set before _make_table_name() is called below.
            self._table_key = table_key

            yield TableData(
                table_name=self._make_table_name(),
                header_list=sorted(header_name_set),
                record_list=record_list,
                quoting_flags=self._loader.quoting_flags)
|
||||
|
||||
|
||||
class MultipleJsonTableConverterB(MultipleJsonTableConverterBase):
    """
    JSON converter for several tables, each expressed as an object of
    columns stored under a table key.
    """

    @property
    def _schema(self):
        # A column is an array whose items match the value-type schema.
        column_schema = {
            "type": "array",
            "items": self._VALUE_TYPE_SCHEMA,
        }

        # A table is an object that maps column names to columns.
        table_schema = {
            "type": "object",
            "additionalProperties": column_schema,
        }

        return {
            "type": "object",
            "additionalProperties": table_schema,
        }

    def to_table_data(self):
        """
        Yield one |TableData| per table key found in the buffered JSON.

        :raises ValueError:
        :raises pytablereader.error.ValidationError:
        """

        self._validate_source_data()

        for table_key, column_mapping in six.iteritems(self._buffer):
            column_name_list = sorted(six.viewkeys(column_mapping))

            self._loader.inc_table_count()
            # Must be set before _make_table_name() is called below.
            self._table_key = table_key

            column_list = [
                column_mapping.get(name) for name in column_name_list
            ]

            yield TableData(
                table_name=self._make_table_name(),
                header_list=column_name_list,
                # Transpose columns into rows; zip() stops at the shortest
                # column.
                record_list=zip(*column_list),
                quoting_flags=self._loader.quoting_flags)
|
||||
|
||||
|
||||
class JsonTableFormatter(TableFormatter):
    """
    Formatter that converts JSON source data to |TableData| by trying
    each known JSON table converter in turn.
    """

    def to_table_data(self):
        """
        Yield the tables produced by the first converter that does not
        raise ``ValidationError`` for the source data.

        :return: Converted table data iterator.
        :rtype: |TableData| iterator
        :raises pytablereader.error.ValidationError:
            If every converter raised ``ValidationError``.
        """

        converter_class_list = [
            MultipleJsonTableConverterA,
            MultipleJsonTableConverterB,
            SingleJsonTableConverterA,
            SingleJsonTableConverterB,
        ]

        for converter_class in converter_class_list:
            converter = converter_class(self._source_data)
            converter.accept(self._loader)

            try:
                for table_data in converter.to_table_data():
                    yield table_data
            except ValidationError:
                # This converter rejected the data; try the next one.
                continue

            # The converter finished without a validation error, so the
            # remaining converters must not be tried.
            return

        raise ValidationError(
            "inconvertible JSON schema: json={}".format(self._source_data))
|
||||
Binary file not shown.
@ -0,0 +1,10 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
|
||||
from ._file import TableFileLoader
|
||||
from ._url import TableUrlLoader
|
||||
Binary file not shown.
@ -0,0 +1,40 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
|
||||
from ..interface import TableLoaderInterface
|
||||
|
||||
|
||||
class TableLoaderManager(TableLoaderInterface):
    """
    Wrapper class that forwards the loader interface to the loader
    instance passed to the constructor.

    :param loader: Loader instance to delegate to.
    """

    def __init__(self, loader):
        self.__loader = loader

    @property
    def format_name(self):
        """Format name of the wrapped loader."""

        return self.__loader.format_name

    @property
    def source_type(self):
        """Source type of the wrapped loader."""

        return self.__loader.source_type

    @property
    def encoding(self):
        """
        Encoding of the wrapped loader.
        |None| if the wrapped loader has no ``encoding`` attribute.
        """

        return getattr(self.__loader, "encoding", None)

    @encoding.setter
    def encoding(self, codec_name):
        self.__loader.encoding = codec_name

    def load(self):
        """Return the result of the wrapped loader's ``load`` method."""

        return self.__loader.load()

    def inc_table_count(self):
        """Call the wrapped loader's ``inc_table_count`` method."""

        self.__loader.inc_table_count()
|
||||
Binary file not shown.
@ -0,0 +1,69 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
|
||||
import typepy
|
||||
from .._constant import Default
|
||||
from ..factory import TableFileLoaderFactory
|
||||
from ._base import TableLoaderManager
|
||||
|
||||
|
||||
class TableFileLoader(TableLoaderManager):
    """
    Loader class to load tables from a file.

    :param str file_path: Path to the file to load.
    :param str format_name: Data format name to load.
        Supported formats are as follows:
        ``"csv"``, ``"excel"``, ``"html"``, ``"json"``, ``"ltsv"``,
        ``"markdown"``, ``"mediawiki"``, ``"sqlite"``, ``"tsv"``.
        If the value is |None|, the file format is automatically
        detected from the ``file_path``.
    :raise pytablereader.InvalidFilePathError:
        If ``file_path`` is an invalid file path.
    :raises pytablereader.LoaderNotFoundError:
        |LoaderNotFoundError_desc| loading the file.

    .. py:method:: load

        Load table data from a file as ``format_name`` format.
        The file format is automatically detected if ``format_name``
        is |None|.

        :return: Loaded table data iterator.
        :rtype: |TableData| iterator

        .. seealso::
            * :py:meth:`pytablereader.factory.TableFileLoaderFactory.create_from_format_name`
            * :py:meth:`pytablereader.factory.TableFileLoaderFactory.create_from_path`
    """

    def __init__(self, file_path, format_name=None, encoding=Default.ENCODING):
        factory = TableFileLoaderFactory(file_path, encoding=encoding)

        # An explicit format name takes precedence over path-based detection.
        loader = (
            factory.create_from_format_name(format_name)
            if typepy.is_not_null_string(format_name)
            else factory.create_from_path()
        )

        super(TableFileLoader, self).__init__(loader)

    @classmethod
    def get_format_name_list(cls):
        """
        :return:
            Available format names. These names can be used by the
            :py:class:`.TableFileLoader` class constructor.
        :rtype: list

        :Example:
            .. code:: python

                >>> pytablereader.TableFileLoader.get_format_name_list()
                ['csv', 'excel', 'html', 'json', 'ltsv', 'markdown', 'mediawiki', 'sqlite', 'tsv']
        """

        # The factory needs a source argument; a placeholder is enough here.
        placeholder_factory = TableFileLoaderFactory("dummy")

        return placeholder_factory.get_format_name_list()
|
||||
Binary file not shown.
@ -0,0 +1,76 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
|
||||
import typepy
|
||||
|
||||
from ..factory import TableUrlLoaderFactory
|
||||
from ._base import TableLoaderManager
|
||||
|
||||
|
||||
class TableUrlLoader(TableLoaderManager):
    """
    Loader class to load tables from a URL.

    :param str url: URL to load.
    :param str format_name: Data format name to load.
        Supported formats are:
        ``"csv"``, ``"excel"``, ``"html"``, ``"json"``, ``"ltsv"``,
        ``"markdown"``, ``"mediawiki"``, ``"sqlite"``, ``"tsv"``.
        If the value is |None|, the file format is automatically
        detected from the ``url``.
    :param dict proxies: http/https proxy information.

        .. seealso::
            - `requests proxies <http://requests-docs-ja.readthedocs.io/en/latest/user/advanced/#proxies>`__

    :raises pytablereader.LoaderNotFoundError:
        |LoaderNotFoundError_desc| loading the URL.
    :raises pytablereader.HTTPError:
        If the loader received an HTTP error when accessing the URL.

    :Example:
        :ref:`example-url-table-loader`

    .. py:method:: load

        Load tables from the URL as ``format_name`` format.

        :return: Loaded table data iterator.
        :rtype: |TableData| iterator

        .. seealso::
            * :py:meth:`pytablereader.factory.TableUrlLoaderFactory.create_from_format_name`
            * :py:meth:`pytablereader.factory.TableUrlLoaderFactory.create_from_path`
    """

    def __init__(self, url, format_name=None, encoding=None, proxies=None):
        factory = TableUrlLoaderFactory(url, encoding, proxies)

        # An explicit format name takes precedence over URL-based detection.
        loader = (
            factory.create_from_format_name(format_name)
            if typepy.is_not_null_string(format_name)
            else factory.create_from_path()
        )

        super(TableUrlLoader, self).__init__(loader)

    @classmethod
    def get_format_name_list(cls):
        """
        :return:
            Available format names. These names can be used by the
            :py:class:`.TableUrlLoader` class constructor.
        :rtype: list

        :Example:
            .. code:: python

                >>> pytablereader.TableUrlLoader.get_format_name_list()
                ['csv', 'excel', 'html', 'json', 'ltsv', 'markdown', 'mediawiki', 'sqlite', 'tsv']
        """

        # The factory needs a URL argument; a placeholder is enough here.
        placeholder_factory = TableUrlLoaderFactory("http://dummy.com/")

        return placeholder_factory.get_format_name_list()
|
||||
Binary file not shown.
Binary file not shown.
@ -0,0 +1,209 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import io
|
||||
|
||||
from pytablereader import (
|
||||
InvalidHeaderNameError,
|
||||
InvalidDataError
|
||||
)
|
||||
import typepy
|
||||
|
||||
import pathvalidate as pv
|
||||
|
||||
from .._constant import (
|
||||
Default,
|
||||
TableNameTemplate as tnt,
|
||||
)
|
||||
from .._logger import (
|
||||
FileSourceLogger,
|
||||
TextSourceLogger,
|
||||
)
|
||||
from .._validator import (
|
||||
FileValidator,
|
||||
TextValidator
|
||||
)
|
||||
from ..interface import TableLoader
|
||||
from ..json.formatter import SingleJsonTableConverterA
|
||||
|
||||
|
||||
class LtsvTableLoader(TableLoader):
    """
    Abstract class of
    `Labeled Tab-separated Values (LTSV) <http://ltsv.org/>`__
    format table loaders.

    .. py:attribute:: encoding

        Encoding of the LTSV data.
    """

    @property
    def format_name(self):
        return "ltsv"

    def __init__(self, source):
        super(LtsvTableLoader, self).__init__(source)

        # Iterable of LTSV lines; assigned by the subclass ``load`` methods.
        self._ltsv_input_stream = None

    def _to_data_matrix(self):
        """
        Parse the lines of ``self._ltsv_input_stream`` into records.

        :return:
            Generator that yields, once, a list that includes an
            ``OrderedDict`` (label to value) for each parsed line.
        :raises pytablereader.InvalidDataError:
            If an item has no ``:`` separator between label and value.
        :raises pytablereader.InvalidHeaderNameError:
            If a label is rejected by ``pathvalidate.validate_ltsv_label``.
        """

        from collections import OrderedDict

        data_matrix = []

        for row_idx, row in enumerate(self._ltsv_input_stream):
            if typepy.is_empty_sequence(row):
                continue

            ltsv_record = OrderedDict()
            for col_idx, ltsv_item in enumerate(row.strip().split("\t")):
                try:
                    # Split at the first colon only: a value may itself
                    # include colons (e.g. timestamps or URLs).
                    label, value = ltsv_item.split(":", 1)
                except ValueError:
                    raise InvalidDataError(
                        "invalid ltsv item found: line={}, col={}, item='{}'".format(
                            row_idx, col_idx, ltsv_item))

                label = label.strip('"')

                try:
                    pv.validate_ltsv_label(label)
                except (pv.NullNameError, pv.InvalidCharError):
                    raise InvalidHeaderNameError(
                        "invalid label found (acceptable chars are [0-9A-Za-z_.-]): "
                        "line={}, col={}, label='{}'".format(
                            row_idx, col_idx, label))

                ltsv_record[label] = value

            data_matrix.append(ltsv_record)

        # using generator to prepare for future enhancement to support
        # iterative load.
        yield data_matrix
|
||||
|
||||
|
||||
class LtsvTableFileLoader(LtsvTableLoader):
    """
    `Labeled Tab-separated Values (LTSV) <http://ltsv.org/>`__
    format file loader class.

    :param str file_path: Path to the loading LTSV file.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(filename)s``.
    """

    def __init__(self, file_path):
        super(LtsvTableFileLoader, self).__init__(file_path)

        self.encoding = Default.ENCODING

        self._validator = FileValidator(file_path)
        self._logger = FileSourceLogger(self)

        # Not referenced by any other method of this class.
        self.__file = None

    def load(self):
        """
        Extract tabular data as |TableData| instances from a LTSV file.
        |load_source_desc_file|

        :return:
            Loaded table data.
            |load_table_name_desc|

            =================== ========================================
            Format specifier    Value after the replacement
            =================== ========================================
            ``%(filename)s``    |filename_desc|
            ``%(format_name)s`` ``"ltsv"``
            ``%(format_id)s``   |format_id_desc|
            ``%(global_id)s``   |global_id|
            =================== ========================================
        :rtype: |TableData| iterator
        :raises pytablereader.InvalidHeaderNameError:
            If an invalid label name is included in the LTSV file.
        :raises pytablereader.InvalidDataError:
            If the LTSV data is invalid.
        """

        self._validate()
        self._logger.logging_load()

        # The context manager closes the file on return and on error.
        # _to_data_matrix() reads every line before its first yield, so
        # the returned table data iterator does not need the open file.
        with io.open(self.source, "r", encoding=self.encoding) as fp:
            self._ltsv_input_stream = fp

            for data_matrix in self._to_data_matrix():
                formatter = SingleJsonTableConverterA(data_matrix)
                formatter.accept(self)

                return formatter.to_table_data()

    def _get_default_table_name_template(self):
        return tnt.FILENAME
|
||||
|
||||
|
||||
class LtsvTableTextLoader(LtsvTableLoader):
    """
    Text loader class for the
    `Labeled Tab-separated Values (LTSV) <http://ltsv.org/>`__ format.

    :param str text: LTSV text to load.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(format_name)s%(format_id)s``.
    """

    def __init__(self, text):
        super(LtsvTableTextLoader, self).__init__(text)

        self._validator = TextValidator(text)
        self._logger = TextSourceLogger(self)

    def load(self):
        """
        Extract tabular data as |TableData| instances from a LTSV text object.
        |load_source_desc_text|

        :return:
            Loaded table data.
            |load_table_name_desc|

            =================== ========================================
            Format specifier    Value after the replacement
            =================== ========================================
            ``%(filename)s``    ``""``
            ``%(format_name)s`` ``"ltsv"``
            ``%(format_id)s``   |format_id_desc|
            ``%(global_id)s``   |global_id|
            =================== ========================================
        :rtype: |TableData| iterator
        :raises pytablereader.InvalidHeaderNameError:
            If an invalid label name is included in the LTSV text.
        :raises pytablereader.InvalidDataError:
            If the LTSV data is invalid.
        """

        self._validate()
        self._logger.logging_load()

        # The parser consumes the text one line at a time.
        self._ltsv_input_stream = self.source.splitlines()

        for record_list in self._to_data_matrix():
            converter = SingleJsonTableConverterA(record_list)
            converter.accept(self)

            return converter.to_table_data()

    def _get_default_table_name_template(self):
        template_part_list = (tnt.FORMAT_NAME, tnt.FORMAT_ID)

        return "{:s}{:s}".format(*template_part_list)
|
||||
Binary file not shown.
Binary file not shown.
@ -0,0 +1,148 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import io
|
||||
|
||||
from .._constant import (
|
||||
Default,
|
||||
SourceType,
|
||||
TableNameTemplate as tnt
|
||||
)
|
||||
from .._logger import (
|
||||
FileSourceLogger,
|
||||
TextSourceLogger,
|
||||
)
|
||||
from .._validator import (
|
||||
FileValidator,
|
||||
TextValidator
|
||||
)
|
||||
from ..interface import TableLoader
|
||||
from .formatter import MarkdownTableFormatter
|
||||
|
||||
|
||||
class MarkdownTableLoader(TableLoader):
    """
    Abstract base class shared by the Markdown table loaders.
    """

    @property
    def format_name(self):
        """Format name of this loader family: ``"markdown"``."""

        return "markdown"
|
||||
|
||||
|
||||
class MarkdownTableFileLoader(MarkdownTableLoader):
    """
    File loader class that extracts tabular data from Markdown files.

    :param str file_path: Path to the loading Markdown file.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(filename)s_%(key)s``.
    """

    def __init__(self, file_path=None):
        super(MarkdownTableFileLoader, self).__init__(file_path)

        self.encoding = Default.ENCODING

        self._validator = FileValidator(file_path)
        self._logger = FileSourceLogger(self)

    def load(self):
        """
        Extract tabular data as |TableData| instances from a Markdown file.
        |load_source_desc_file|

        :return:
            Loaded table data iterator.
            |load_table_name_desc|

            =================== ==============================================
            Format specifier    Value after the replacement
            =================== ==============================================
            ``%(filename)s``    |filename_desc|
            ``%(key)s``         ``%(format_name)s%(format_id)s``
            ``%(format_name)s`` ``"markdown"``
            ``%(format_id)s``   |format_id_desc|
            ``%(global_id)s``   |global_id|
            =================== ==============================================
        :rtype: |TableData| iterator
        :raises pytablereader.InvalidDataError:
            If the Markdown data is invalid or empty.
        """

        self._validate()
        self._logger.logging_load()

        # Read the whole file first so that the file is closed before
        # the table data iterator is consumed.
        with io.open(self.source, "r", encoding=self.encoding) as fp:
            markdown_text = fp.read()

        formatter = MarkdownTableFormatter(markdown_text)
        formatter.accept(self)

        return formatter.to_table_data()

    def _get_default_table_name_template(self):
        template_part_list = (tnt.FILENAME, tnt.KEY)

        return "{:s}_{:s}".format(*template_part_list)
|
||||
|
||||
|
||||
class MarkdownTableTextLoader(MarkdownTableLoader):
    """
    Text loader class that extracts tabular data from Markdown text data.

    :param str text: Markdown text to load.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(key)s``.
    """

    def __init__(self, text):
        super(MarkdownTableTextLoader, self).__init__(text)

        self._validator = TextValidator(text)
        self._logger = TextSourceLogger(self)

    @property
    def source_type(self):
        return SourceType.TEXT

    def load(self):
        """
        Extract tabular data as |TableData| instances from a Markdown text
        object.
        |load_source_desc_text|

        :return:
            Loaded table data iterator.
            |load_table_name_desc|

            =================== ==============================================
            Format specifier    Value after the replacement
            =================== ==============================================
            ``%(filename)s``    ``""``
            ``%(key)s``         ``%(format_name)s%(format_id)s``
            ``%(format_name)s`` ``"markdown"``
            ``%(format_id)s``   |format_id_desc|
            ``%(global_id)s``   |global_id|
            =================== ==============================================
        :rtype: |TableData| iterator
        :raises pytablereader.InvalidDataError:
            If the Markdown data is invalid or empty.
        """

        self._validate()
        self._logger.logging_load()

        table_formatter = MarkdownTableFormatter(self.source)
        table_formatter.accept(self)

        return table_formatter.to_table_data()

    def _get_default_table_name_template(self):
        key_template = tnt.KEY

        return "{:s}".format(key_template)
|
||||
Binary file not shown.
@ -0,0 +1,25 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from pytablereader import InvalidDataError
|
||||
import typepy
|
||||
|
||||
from ..html.formatter import HtmlTableFormatter
|
||||
|
||||
|
||||
class MarkdownTableFormatter(HtmlTableFormatter):
    """
    Formatter that converts Markdown text to HTML with ``markdown2``
    (with the ``tables`` extra) and delegates to the HTML table formatter.

    :param str source_data: Markdown text to convert.
    :raises pytablereader.InvalidDataError:
        If ``source_data`` is a null string.
    """

    def __init__(self, source_data):
        import markdown2

        if typepy.is_null_string(source_data):
            raise InvalidDataError

        html_text = markdown2.markdown(source_data, extras=["tables"])

        super(MarkdownTableFormatter, self).__init__(html_text)
|
||||
Binary file not shown.
Binary file not shown.
@ -0,0 +1,156 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import io
|
||||
|
||||
from .._constant import (
|
||||
Default,
|
||||
SourceType,
|
||||
TableNameTemplate as tnt
|
||||
)
|
||||
from .._logger import (
|
||||
FileSourceLogger,
|
||||
TextSourceLogger,
|
||||
)
|
||||
from .._validator import (
|
||||
FileValidator,
|
||||
TextValidator
|
||||
)
|
||||
from ..interface import TableLoader
|
||||
from .formatter import MediaWikiTableFormatter
|
||||
|
||||
|
||||
class MediaWikiTableLoader(TableLoader):
    """
    Abstract base class shared by the MediaWiki table loaders.
    """

    @property
    def format_name(self):
        """Format name of this loader family: ``"mediawiki"``."""

        return "mediawiki"
|
||||
|
||||
|
||||
class MediaWikiTableFileLoader(MediaWikiTableLoader):
    """
    File loader class that extracts tabular data from MediaWiki files.

    :param str file_path: Path to the loading file.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(filename)s_%(key)s``.
    """

    def __init__(self, file_path=None):
        super(MediaWikiTableFileLoader, self).__init__(file_path)

        self.encoding = Default.ENCODING

        self._validator = FileValidator(file_path)
        self._logger = FileSourceLogger(self)

    def load(self):
        """
        Extract tabular data as |TableData| instances from a MediaWiki file.
        |load_source_desc_file|

        :return:
            Loaded table data iterator.
            |load_table_name_desc|

            =================== ==============================================
            Format specifier    Value after the replacement
            =================== ==============================================
            ``%(filename)s``    |filename_desc|
            ``%(key)s``         | This is replaced with:
                                | **(1)** ``caption`` mark of the table
                                | **(2)** ``%(format_name)s%(format_id)s``
                                | if ``caption`` mark is not included
                                | in the table.
            ``%(format_name)s`` ``"mediawiki"``
            ``%(format_id)s``   |format_id_desc|
            ``%(global_id)s``   |global_id|
            =================== ==============================================
        :rtype: |TableData| iterator
        :raises pytablereader.InvalidDataError:
            If the MediaWiki data is invalid or empty.
        """

        self._validate()
        self._logger.logging_load()

        # Read the whole file first so that the file is closed before
        # the table data iterator is consumed.
        with io.open(self.source, "r", encoding=self.encoding) as fp:
            mediawiki_text = fp.read()

        formatter = MediaWikiTableFormatter(mediawiki_text)
        formatter.accept(self)

        return formatter.to_table_data()

    def _get_default_table_name_template(self):
        template_part_list = (tnt.FILENAME, tnt.KEY)

        return "{:s}_{:s}".format(*template_part_list)
|
||||
|
||||
|
||||
class MediaWikiTableTextLoader(MediaWikiTableLoader):
    """
    Text loader class that extracts tabular data from MediaWiki text data.

    :param str text: MediaWiki text to load.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(key)s``.
    """

    def __init__(self, text):
        super(MediaWikiTableTextLoader, self).__init__(text)

        self._validator = TextValidator(text)
        self._logger = TextSourceLogger(self)

    @property
    def source_type(self):
        return SourceType.TEXT

    def load(self):
        """
        Extract tabular data as |TableData| instances from a MediaWiki text
        object.
        |load_source_desc_text|

        :return:
            Loaded table data iterator.
            |load_table_name_desc|

            =================== ==============================================
            Format specifier    Value after the replacement
            =================== ==============================================
            ``%(filename)s``    ``""``
            ``%(key)s``         | This is replaced with:
                                | **(1)** ``caption`` mark of the table
                                | **(2)** ``%(format_name)s%(format_id)s``
                                | if ``caption`` mark is not included
                                | in the table.
            ``%(format_name)s`` ``"mediawiki"``
            ``%(format_id)s``   |format_id_desc|
            ``%(global_id)s``   |global_id|
            =================== ==============================================
        :rtype: |TableData| iterator
        :raises pytablereader.InvalidDataError:
            If the MediaWiki data is invalid or empty.
        """

        self._validate()
        self._logger.logging_load()

        table_formatter = MediaWikiTableFormatter(self.source)
        table_formatter.accept(self)

        return table_formatter.to_table_data()

    def _get_default_table_name_template(self):
        key_template = tnt.KEY

        return "{:s}".format(key_template)
|
||||
Binary file not shown.
@ -0,0 +1,23 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..error import PypandocImportError
|
||||
from ..html.formatter import HtmlTableFormatter
|
||||
|
||||
|
||||
class MediaWikiTableFormatter(HtmlTableFormatter):
    """
    Formatter that converts MediaWiki text to HTML with ``pypandoc`` and
    delegates to the HTML table formatter.

    :param str source_data: MediaWiki text to convert.
    :raises pytablereader.error.PypandocImportError:
        If ``pypandoc`` cannot be imported.
    """

    def __init__(self, source_data):
        # pypandoc is imported lazily, only when this formatter is used.
        try:
            import pypandoc
        except ImportError as import_error:
            raise PypandocImportError(import_error)

        html_text = pypandoc.convert_text(
            source_data, "html", format="mediawiki")

        super(MediaWikiTableFormatter, self).__init__(html_text)
|
||||
Binary file not shown.
Binary file not shown.
@ -0,0 +1,69 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import abc
|
||||
|
||||
from .._constant import TableNameTemplate as tnt
|
||||
from ..interface import TableLoader
|
||||
|
||||
|
||||
class SpreadSheetLoader(TableLoader):
    """
    Abstract class of loaders for spreadsheet-like table data,
    that is, sources that consist of sheets with multiple rows.

    .. py:attribute:: start_row

        The first row to search header row.
    """

    def __init__(self, source):
        super(SpreadSheetLoader, self).__init__(source)

        self.start_row = 0

        # Sheet currently being processed and the column index range in
        # use; these are assigned by the concrete loaders.
        self._worksheet = None
        self._start_col_idx = None
        self._end_col_idx = None

    @property
    def format_name(self):
        return "spreadsheet"

    @abc.abstractproperty
    def _sheet_name(self):  # pragma: no cover
        pass

    @abc.abstractproperty
    def _row_count(self):  # pragma: no cover
        pass

    @abc.abstractproperty
    def _col_count(self):  # pragma: no cover
        pass

    @abc.abstractmethod
    def _is_empty_sheet(self):  # pragma: no cover
        pass

    @abc.abstractmethod
    def _get_start_row_idx(self):  # pragma: no cover
        pass

    def _make_table_name(self):
        """
        Create a table name by expanding the table name format, with
        the sheet template mapped to the current sheet name.
        """

        name_mapping = self._get_basic_tablename_keyvalue_mapping()

        # Use an empty sheet name when reading the sheet name raises
        # AttributeError.
        name_mapping[tnt.SHEET] = getattr(self, "_sheet_name", "")

        return self._expand_table_name_format(name_mapping)

    def _get_default_table_name_template(self):
        sheet_template = tnt.SHEET

        return "{:s}".format(sheet_template)
|
||||
Binary file not shown.
@ -0,0 +1,160 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from pytablereader import InvalidDataError
|
||||
from tabledata import TableData
|
||||
import xlrd
|
||||
|
||||
from six.moves import range
|
||||
|
||||
from .._logger import FileSourceLogger
|
||||
from .._validator import FileValidator
|
||||
from ..error import OpenError
|
||||
from .core import SpreadSheetLoader
|
||||
|
||||
|
||||
class ExcelTableFileLoader(SpreadSheetLoader):
|
||||
"""
|
||||
A file loader class to extract tabular data from Microsoft Excel |TM|
|
||||
files.
|
||||
|
||||
:param str file_path: Path to the loading Excel workbook file.
|
||||
|
||||
.. py:attribute:: table_name
|
||||
|
||||
Table name string. Defaults to ``%(sheet)s``.
|
||||
|
||||
.. py:attribute:: start_row
|
||||
|
||||
The first row to search header row.
|
||||
"""
|
||||
|
||||
@property
|
||||
def format_name(self):
|
||||
return "excel"
|
||||
|
||||
@property
|
||||
def _sheet_name(self):
|
||||
return self._worksheet.name
|
||||
|
||||
@property
|
||||
def _row_count(self):
|
||||
return self._worksheet.nrows
|
||||
|
||||
@property
|
||||
def _col_count(self):
|
||||
return self._worksheet.ncols
|
||||
|
||||
def __init__(self, file_path=None):
|
||||
super(ExcelTableFileLoader, self).__init__(file_path)
|
||||
|
||||
self._validator = FileValidator(file_path)
|
||||
self._logger = FileSourceLogger(self)
|
||||
|
||||
def load(self):
|
||||
"""
|
||||
Extract tabular data as |TableData| instances from an Excel file.
|
||||
|spreadsheet_load_desc|
|
||||
|
||||
:return:
|
||||
Loaded |TableData| iterator.
|
||||
|TableData| created for each sheet in the workbook.
|
||||
|load_table_name_desc|
|
||||
|
||||
=================== ====================================
|
||||
Format specifier Value after the replacement
|
||||
=================== ====================================
|
||||
``%(filename)s`` Filename of the workbook
|
||||
``%(sheet)s`` Name of the sheet
|
||||
``%(format_name)s`` ``"spreadsheet"``
|
||||
``%(format_id)s`` |format_id_desc|
|
||||
``%(global_id)s`` |global_id|
|
||||
=================== ====================================
|
||||
:rtype: |TableData| iterator
|
||||
:raises pytablereader.InvalidDataError:
|
||||
If the header row is not found.
|
||||
:raises pytablereader.error.OpenError:
|
||||
If failed to open the source file.
|
||||
"""
|
||||
|
||||
self._validate()
|
||||
self._logger.logging_load()
|
||||
|
||||
try:
|
||||
workbook = xlrd.open_workbook(self.source)
|
||||
except xlrd.biffh.XLRDError as e:
|
||||
raise OpenError(e)
|
||||
|
||||
for worksheet in workbook.sheets():
|
||||
self._worksheet = worksheet
|
||||
|
||||
if self._is_empty_sheet():
|
||||
continue
|
||||
|
||||
self.__extract_not_empty_col_idx()
|
||||
|
||||
try:
|
||||
start_row_idx = self._get_start_row_idx()
|
||||
except InvalidDataError:
|
||||
continue
|
||||
|
||||
header_list = self.__get_row_values(start_row_idx)
|
||||
record_list = [
|
||||
self.__get_row_values(row_idx)
|
||||
for row_idx in range(start_row_idx + 1, self._row_count)
|
||||
]
|
||||
|
||||
self.inc_table_count()
|
||||
|
||||
yield TableData(
|
||||
self._make_table_name(), header_list, record_list,
|
||||
is_strip_quote=True, quoting_flags=self.quoting_flags)
|
||||
|
||||
def _is_empty_sheet(self):
|
||||
return any([
|
||||
self._col_count == 0,
|
||||
self._row_count <= 1,
|
||||
# nrows == 1 means exists header row only
|
||||
])
|
||||
|
||||
def _get_start_row_idx(self):
    """
    Find the index of the header row of the current worksheet.

    Rows are examined in order, beginning at ``self.start_row``; the
    first row accepted by ``__is_header_row`` wins.

    :return: Index of the header row.
    :rtype: int
    :raises InvalidDataError:
        If no row from ``self.start_row`` onwards is a header row.
    """

    for candidate_idx in range(self.start_row, self._row_count):
        if self.__is_header_row(candidate_idx):
            return candidate_idx

    raise InvalidDataError("header row not found")
|
||||
|
||||
def __is_header_row(self, row_idx):
    """
    Tell whether a row qualifies as a header row.

    A row qualifies when none of its cells within the extracted column
    range (``_start_col_idx`` to ``_end_col_idx``, inclusive) is of the
    empty cell type.

    :param int row_idx: Index of the row to examine.
    :rtype: bool
    """

    first_col_idx = self._start_col_idx
    # The end bound passed to ``row_types`` is exclusive, hence the + 1.
    stop_col_idx = self._end_col_idx + 1
    row_cell_types = self._worksheet.row_types(
        row_idx, first_col_idx, stop_col_idx)

    return xlrd.XL_CELL_EMPTY not in row_cell_types
|
||||
|
||||
@staticmethod
def __is_empty_cell_type_list(cell_type_list):
    """
    Tell whether every cell type in the sequence is the empty type.

    :param cell_type_list: Sequence of xlrd cell type codes.
    :return: |True| if all entries equal ``xlrd.XL_CELL_EMPTY``
        (an empty sequence also yields |True|).
    :rtype: bool
    """

    for cell_type in cell_type_list:
        if cell_type != xlrd.XL_CELL_EMPTY:
            return False

    return True
|
||||
|
||||
def __extract_not_empty_col_idx(self):
    """
    Record the column range of the current worksheet that holds data.

    Stores the smallest and the largest index of the columns that are
    not entirely empty in ``_start_col_idx`` and ``_end_col_idx``.

    :raises ValueError:
        If every column of the worksheet is empty (``min`` of an empty
        list).
    """

    used_col_indexes = []
    for col_idx in range(self._col_count):
        col_cell_types = self._worksheet.col_types(col_idx)
        if self.__is_empty_cell_type_list(col_cell_types):
            continue

        used_col_indexes.append(col_idx)

    self._start_col_idx = min(used_col_indexes)
    self._end_col_idx = max(used_col_indexes)
|
||||
|
||||
def __get_row_values(self, row_idx):
|
||||
return self._worksheet.row_values(
|
||||
row_idx, self._start_col_idx, self._end_col_idx + 1)
|
||||
Binary file not shown.
@ -0,0 +1,184 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
|
||||
from tabledata import TableData
|
||||
import typepy
|
||||
|
||||
from .._constant import TableNameTemplate as tnt
|
||||
from .._validator import TextValidator
|
||||
from ..error import OpenError
|
||||
from .core import SpreadSheetLoader
|
||||
|
||||
|
||||
class GoogleSheetsTableLoader(SpreadSheetLoader):
    """
    Concrete class of Google Spreadsheet loader.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(sheet)s``.

    :param str file_path: Path to the Google Sheets credential JSON file.

    :Dependency Packages:
        - `gspread <https://github.com/burnash/gspread>`_
        - `SimpleSQLite <https://github.com/thombashi/SimpleSQLite>`_
        - `oauth2client <https://pypi.python.org/pypi/oauth2client>`_
        - `pyOpenSSL <https://pypi.python.org/pypi/pyOpenSSL>`_

    :Examples:
        :ref:`example-gs-table-loader`
    """

    @property
    def _sheet_name(self):
        # Title of the worksheet that is currently being loaded.
        return self._worksheet.title

    @property
    def _row_count(self):
        return self._worksheet.row_count

    @property
    def _col_count(self):
        return self._worksheet.col_count

    def __init__(self, file_path=None):
        super(GoogleSheetsTableLoader, self).__init__(file_path)

        # Title of the spreadsheet to open; must be set before ``load()``.
        self.title = None
        # Index of the first row considered when searching the header row.
        self.start_row = 0

        self._validator = TextValidator(file_path)

        # Cell values of the worksheet currently being processed.
        self.__all_values = None

    def load(self):
        """
        Load table data from a Google Spreadsheet.

        This method considers :py:attr:`.source` as the path to the
        credential JSON file used to access the Google Sheets API.

        The method automatically searches for the header row, starting
        from :py:attr:`.start_row`. A header row is a row in which every
        column has a value (leading empty columns are stripped
        beforehand). Worksheets that are empty or that have no header
        row are skipped.

        :return:
            Loaded table data. Return one |TableData| for each sheet in
            the workbook. The table name for data will be determined by
            :py:meth:`~.GoogleSheetsTableLoader.make_table_name`.
        :rtype: iterator of |TableData|
        :raises ValueError:
            If the spreadsheet title is empty.
        :raises pytablereader.OpenError:
            If the spreadsheet is not found.
        """

        import gspread
        from oauth2client.service_account import ServiceAccountCredentials

        self._validate_table_name()
        self._validate_title()

        scope = ['https://spreadsheets.google.com/feeds']
        credentials = ServiceAccountCredentials.from_json_keyfile_name(
            self.source, scope)

        gc = gspread.authorize(credentials)
        try:
            for worksheet in gc.open(self.title).worksheets():
                self._worksheet = worksheet
                self.__all_values = list(worksheet.get_all_values())

                if self._is_empty_sheet():
                    continue

                try:
                    self.__strip_empty_col()
                except ValueError:
                    continue

                value_matrix = self.__all_values[self._get_start_row_idx():]
                try:
                    header_list = value_matrix[0]
                    record_list = value_matrix[1:]
                except IndexError:
                    # No header row was found: nothing to load from
                    # this worksheet.
                    continue

                self.inc_table_count()

                yield TableData(
                    self.make_table_name(), header_list, record_list,
                    quoting_flags=self.quoting_flags)
        except gspread.exceptions.SpreadsheetNotFound:
            raise OpenError("spreadsheet '{}' not found".format(self.title))

    def _is_empty_sheet(self):
        # A single row means that at most a header row exists.
        return len(self.__all_values) <= 1

    def _get_start_row_idx(self):
        """
        Find the index of the header row within the fetched values.

        The search starts at :py:attr:`.start_row`; the first row in
        which every value is a not-null string is the header row.

        :return:
            Index of the header row. If no such row exists, the number
            of fetched rows is returned, so that slicing the values
            with it yields an empty list.
        :rtype: int
        """

        row_count = len(self.__all_values)

        # Scan from ``start_row`` and return an absolute index. Scanning
        # from row 0 and adding ``start_row`` afterwards would point at
        # the wrong row whenever ``start_row`` is not zero.
        for row_idx in range(self.start_row, row_count):
            if all(
                typepy.is_not_null_string(value)
                for value in self.__all_values[row_idx]
            ):
                return row_idx

        return row_count

    def _validate_title(self):
        """
        :raises ValueError: If the spreadsheet title is a null string.
        """

        if typepy.is_null_string(self.title):
            raise ValueError("spreadsheet title is empty")

    def _make_table_name(self):
        """
        Build the table name for the worksheet currently being loaded.

        Adds the spreadsheet title and the sheet name to the basic
        table-name mapping and expands the table name format with it.

        :raises ValueError: If the spreadsheet title is a null string.
        """

        self._validate_title()

        kv_mapping = self._get_basic_tablename_keyvalue_mapping()
        kv_mapping[tnt.TITLE] = self.title
        try:
            kv_mapping[tnt.SHEET] = self._sheet_name
        except AttributeError:
            # No worksheet has been selected yet.
            kv_mapping[tnt.SHEET] = ""

        return self._expand_table_name_format(kv_mapping)

    def __strip_empty_col(self):
        """
        Drop the leading empty columns from the fetched values.

        The values are loaded into a temporary in-memory SQLite table;
        the columns from the first one that contains a not-null string
        onwards are then selected back into ``self.__all_values``.

        :raises ValueError: If no column remains after stripping.
        """

        from simplesqlite import connect_sqlite_memdb
        from simplesqlite.sqlquery import SqlQuery

        con = connect_sqlite_memdb()

        tmp_table_name = "tmp"
        # Synthetic column names: the real header row is not known yet.
        header_list = [
            "a{:d}".format(i)
            for i in range(len(self.__all_values[0]))
        ]
        con.create_table_from_data_matrix(
            table_name=tmp_table_name,
            attr_name_list=header_list,
            data_matrix=self.__all_values)
        for col_idx, header in enumerate(header_list):
            result = con.select(
                select=SqlQuery.to_attr_str(header), table_name=tmp_table_name)
            if any([
                typepy.is_not_null_string(record[0])
                for record in result.fetchall()
            ]):
                # First column that holds a value: keep it and the rest.
                break

        strip_header_list = header_list[col_idx:]
        if typepy.is_empty_sequence(strip_header_list):
            raise ValueError()

        result = con.select(
            select=",".join(SqlQuery.to_attr_str_list(strip_header_list)),
            table_name=tmp_table_name)
        self.__all_values = result.fetchall()
|
||||
Binary file not shown.
Binary file not shown.
@ -0,0 +1,70 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .._constant import TableNameTemplate as tnt
|
||||
from .._validator import FileValidator
|
||||
from ..interface import TableLoader
|
||||
from .formatter import SqliteTableFormatter
|
||||
|
||||
|
||||
class SqliteFileLoader(TableLoader):
    """
    A file loader class to extract tabular data from SQLite database files.

    :param str file_path: Path to the loading SQLite database file.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(filename)s_%(key)s``.

    :Dependency Packages:
        - `SimpleSQLite <https://github.com/thombashi/SimpleSQLite>`__
    """

    @property
    def format_name(self):
        """Name of the format handled by this loader."""

        return "sqlite"

    def __init__(self, file_path=None):
        super(SqliteFileLoader, self).__init__(file_path)

        self._validator = FileValidator(file_path)

    def load(self):
        """
        Extract tabular data as |TableData| instances from a SQLite database
        file. |load_source_desc_file|

        :return:
            Loaded table data iterator.
            |load_table_name_desc|

            ===================  ==============================================
            Format specifier     Value after the replacement
            ===================  ==============================================
            ``%(filename)s``     |filename_desc|
            ``%(key)s``          ``%(format_name)s%(format_id)s``
            ``%(format_name)s``  ``"sqlite"``
            ``%(format_id)s``    |format_id_desc|
            ``%(global_id)s``    |global_id|
            ===================  ==============================================
        :rtype: |TableData| iterator
        :raises pytablereader.InvalidDataError:
            If the SQLite database file data is invalid or empty.
        """

        self._validate()

        # The formatter does the actual extraction; it is given this
        # loader through ``accept`` before it is asked for the data.
        table_formatter = SqliteTableFormatter(self.source)
        table_formatter.accept(self)
        table_data_iterator = table_formatter.to_table_data()

        return table_data_iterator

    def _get_default_table_name_template(self):
        # NOTE(review): the class docstring states a different default
        # (``%(filename)s_%(key)s``) -- confirm which one is intended.
        template = "{:s}{:s}"

        return template.format(tnt.FORMAT_NAME, tnt.FORMAT_ID)
|
||||
Binary file not shown.
@ -0,0 +1,50 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from pytablereader import InvalidDataError
|
||||
from tabledata import TableData
|
||||
import typepy
|
||||
|
||||
from .._constant import TableNameTemplate as tnt
|
||||
from ..formatter import TableFormatter
|
||||
|
||||
|
||||
class SqliteTableFormatter(TableFormatter):
    """
    Formatter that converts the tables of a SQLite database into
    |TableData| instances.

    :param source_data: Source of the SQLite database to read.
    :raises pytablereader.InvalidDataError:
        If ``source_data`` is a null string.
    """

    def __init__(self, source_data):
        super(SqliteTableFormatter, self).__init__(source_data)

        # Name of the table that is currently being converted.
        self.__table_name = None

        if typepy.is_null_string(source_data):
            raise InvalidDataError()

    def to_table_data(self):
        """
        Yield one |TableData| for each table in the database.

        :rtype: |TableData| iterator
        """

        from simplesqlite import SimpleSQLite
        from simplesqlite.sqlquery import SqlQuery

        # NOTE(review): the connection is never closed explicitly here
        # -- confirm whether that is intentional.
        connection = SimpleSQLite(self._source_data, "r")

        for table_name in connection.get_table_name_list():
            self.__table_name = table_name

            column_names = connection.get_attr_name_list(table_name)
            select_clause = ",".join(SqlQuery.to_attr_str_list(column_names))
            rows = connection.select(
                select=select_clause, table_name=table_name)

            yield TableData(
                table_name, column_names, rows,
                quoting_flags=self._loader.quoting_flags)

    def _make_table_name(self):
        """
        Expand the loader's table name format, with the name of the
        table currently being converted as the ``KEY`` value.
        """

        loader = self._loader
        key_pair_list = [
            (tnt.KEY, self.__table_name),
        ]
        kv_mapping = (
            loader._get_basic_tablename_keyvalue_mapping() + key_pair_list)

        return loader._expand_table_name_format(kv_mapping)
|
||||
Binary file not shown.
Binary file not shown.
@ -0,0 +1,63 @@
|
||||
# encoding: utf-8
|
||||
|
||||
"""
|
||||
.. codeauthor:: Tsuyoshi Hombashi <tsuyoshi.hombashi@gmail.com>
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .._validator import (
|
||||
FileValidator,
|
||||
TextValidator
|
||||
)
|
||||
from ..csv.core import (
|
||||
CsvTableFileLoader,
|
||||
CsvTableTextLoader
|
||||
)
|
||||
|
||||
|
||||
class TsvTableFileLoader(CsvTableFileLoader):
    """
    Loader class for files in the tab separated values (TSV) format.

    This is the CSV file loader with the delimiter set to a tab
    character.

    :param str file_path: Path to the loading TSV file.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(filename)s``.
    """

    @property
    def format_name(self):
        """Name of the format handled by this loader."""

        return "tsv"

    def __init__(self, file_path):
        super(TsvTableFileLoader, self).__init__(file_path)

        # The tab delimiter is the only difference from the CSV loader.
        tab_delimiter = "\t"
        self.delimiter = tab_delimiter

        self._validator = FileValidator(file_path)
|
||||
|
||||
|
||||
class TsvTableTextLoader(CsvTableTextLoader):
    """
    Loader class for text in the tab separated values (TSV) format.

    This is the CSV text loader with the delimiter set to a tab
    character.

    :param str text: TSV text to load.

    .. py:attribute:: table_name

        Table name string. Defaults to ``%(format_name)s%(format_id)s``.
    """

    @property
    def format_name(self):
        """Name of the format handled by this loader."""

        return "tsv"

    def __init__(self, text):
        super(TsvTableTextLoader, self).__init__(text)

        # The tab delimiter is the only difference from the CSV loader.
        tab_delimiter = "\t"
        self.delimiter = tab_delimiter

        self._validator = TextValidator(text)
|
||||
Binary file not shown.
Reference in New Issue
Block a user