Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions image_match/elasticsearch_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ def __init__(self, es, index='images', doc_type='image', timeout='10s', size=100
def search_single_record(self, rec, pre_filter=None):
path = rec.pop('path')
signature = rec.pop('signature')

if 'metadata' in rec:
rec.pop('metadata')

Expand All @@ -70,27 +71,26 @@ def search_single_record(self, rec, pre_filter=None):
size=self.size,
timeout=self.timeout)['hits']['hits']

sigs = np.array([x['_source']['signature'] for x in res])

if sigs.size == 0:
if len(res) == 0:
return []

dists = normalized_distance(sigs, np.array(signature))

formatted_res = [{'id': x['_id'],
'score': x['_score'],
'metadata': x['_source'].get('metadata'),
'path': x['_source'].get('url', x['_source'].get('path'))}
for x in res]

for i, row in enumerate(formatted_res):
row['dist'] = dists[i]
formatted_res = filter(lambda y: y['dist'] < self.distance_cutoff, formatted_res)
formatted_res = filter(lambda y: y['score'] > self.score_cutoff, formatted_res)

return formatted_res

def insert_single_record(self, rec, refresh_after=False):
rec['timestamp'] = datetime.now()

# Don't store signature in index
if 'signature' in rec:
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should add a setting to the class constructor that controls whether the signature is stored. It should default to storing it, to stay backwards compatible. I can work on this.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So will you work on that, or should I handle it?

P.S. Can't wait to get that PR in ;)

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can look at it sometime in the near future

rec.pop('signature')

self.es.index(index=self.index, doc_type=self.doc_type, body=rec, refresh=refresh_after)

def delete_duplicates(self, path):
Expand Down
40 changes: 36 additions & 4 deletions image_match/signature_database_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def insert_single_record(self, rec):
raise NotImplementedError

def __init__(self, k=16, N=63, n_grid=9,
crop_percentile=(5, 95), distance_cutoff=0.45,
crop_percentile=(5, 95), distance_cutoff=0.45, score_cutoff=9.0,
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Elasticsearch-specific settings should live in the Elasticsearch driver. You can accept **kwargs here and pick the setting up in the child class (tell me if it's not clear what I mean).

*signature_args, **signature_kwargs):
"""Set up storage scheme for images

Expand Down Expand Up @@ -159,6 +159,8 @@ def __init__(self, k=16, N=63, n_grid=9,
considering how much variance to keep in the image (default (5, 95))
distance_cutoff (Optional [float]): maximum image signature distance to
be considered a match (default 0.45)
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So the default value of 9.0 is the reason the test is failing ... it looks like only 8 words match. Maybe I need to think a bit more about the default value, but if you change it to something less than 8, the tests should pass.

score_cutoff (Optional [float]): minimum ElasticSearch relevance score to
be considered a match (default 9.0)
*signature_args: Variable length argument list to pass to ImageSignature
**signature_kwargs: Arbitrary keyword arguments to pass to ImageSignature

Expand All @@ -175,14 +177,22 @@ def __init__(self, k=16, N=63, n_grid=9,
self.N = N
self.n_grid = n_grid

# Check float input
# Check float input for distance cutoff
if type(distance_cutoff) is not float:
raise TypeError('distance_cutoff should be a float')
if distance_cutoff < 0.:
raise ValueError('distance_cutoff should be > 0 (got %r)' % distance_cutoff)

self.distance_cutoff = distance_cutoff

# Check float input for elasticsearch score cutoff
if type(score_cutoff) is not float:
raise TypeError('score_cutoff should be a float')
if score_cutoff < 0.:
raise ValueError('score_cutoff should be > 0 (got %r)' % score_cutoff)

self.score_cutoff = score_cutoff

self.crop_percentile = crop_percentile

self.gis = ImageSignature(n=n_grid, crop_percentiles=crop_percentile, *signature_args, **signature_kwargs)
Expand Down Expand Up @@ -222,7 +232,7 @@ def search_image(self, path, all_orientations=False, bytestream=False, pre_filte
pre_filter (Optional[dict]): filters list before applying the matching algorithm
(default None)
Returns:
a formatted list of dicts representing unique matches, sorted by dist
a formatted list of dicts representing unique matches, sorted by dist or score (in case of using ElasticSearch)

For example, if three matches are found:

Expand All @@ -238,6 +248,19 @@ def search_image(self, path, all_orientations=False, bytestream=False, pre_filte
'path': u'https://c2.staticflickr.com/8/7158/6814444991_08d82de57e_z.jpg'}
]

Here is an ElasticSearch example:

[
{'score': 4.0,
'id': u'AVM37oZq0osmmAxpPvx7',
'path': u'https://pixabay.com/static/uploads/photo/2012/11/28/08/56/mona-lisa-67506_960_720.jpg'},
{'score': 35.0,
'id': u'AVM37nMg0osmmAxpPvx6',
'path': u'https://upload.wikimedia.org/wikipedia/commons/thumb/e/ec/Mona_Lisa,_by_Leonardo_da_Vinci,_from_C2RMF_retouched.jpg/687px-Mona_Lisa,_by_Leonardo_da_Vinci,_from_C2RMF_retouched.jpg'},
{'score': 10.0,
'id': u'AVM37p530osmmAxpPvx9',
'path': u'https://c2.staticflickr.com/8/7158/6814444991_08d82de57e_z.jpg'}
]
"""
img = self.gis.preprocess_image(path, bytestream)

Expand Down Expand Up @@ -277,12 +300,21 @@ def search_image(self, path, all_orientations=False, bytestream=False, pre_filte

ids = set()
unique = []
hasScore = False
for item in result:
if 'score' in item:
hasScore = True

if item['id'] not in ids:
unique.append(item)
ids.add(item['id'])

r = sorted(unique, key=itemgetter('dist'))
# If data comes from ElasticSearch - sort by score, otherwise - default to sorting by dist
if hasScore:
r = sorted(unique, key=itemgetter('score'), reverse=True)
else:
r = sorted(unique, key=itemgetter('dist'))

return r


Expand Down
24 changes: 10 additions & 14 deletions tests/test_elasticsearch_driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,15 @@
DOC_TYPE = 'image'
MAPPINGS = {
"mappings": {
DOC_TYPE: {
DOC_TYPE: {
"dynamic": True,
"properties": {
"metadata": {
"properties": {
"metadata": {
"type": "object",
"dynamic": True,
"properties": {
"properties": {
"tenant_id": { "type": "keyword" }
}
}
}
}
}
Expand Down Expand Up @@ -122,7 +122,6 @@ def test_lookup_from_url(ses):
assert len(r) == 1
assert r[0]['path'] == 'test1.jpg'
assert 'score' in r[0]
assert 'dist' in r[0]
assert 'id' in r[0]


Expand All @@ -132,7 +131,6 @@ def test_lookup_from_file(ses):
assert len(r) == 1
assert r[0]['path'] == 'test1.jpg'
assert 'score' in r[0]
assert 'dist' in r[0]
assert 'id' in r[0]

def test_lookup_from_bytestream(ses):
Expand All @@ -142,7 +140,6 @@ def test_lookup_from_bytestream(ses):
assert len(r) == 1
assert r[0]['path'] == 'test1.jpg'
assert 'score' in r[0]
assert 'dist' in r[0]
assert 'id' in r[0]

def test_lookup_with_cutoff(ses):
Expand Down Expand Up @@ -171,7 +168,6 @@ def test_add_image_with_metadata(ses):
assert r[0]['metadata'] == metadata
assert 'path' in r[0]
assert 'score' in r[0]
assert 'dist' in r[0]
assert 'id' in r[0]


Expand All @@ -190,13 +186,13 @@ def test_lookup_with_filter_by_metadata(ses):
assert len(r) == 1
assert r[0]['metadata'] == metadata

r = ses.search_image('test1.jpg', pre_filter={"term": {"metadata.tenant_id": "bar-2"}})
r = ses.search_image('test2.jpg', pre_filter={"term": {"metadata.tenant_id": "bar-2"}})
assert len(r) == 1
assert r[0]['metadata'] == metadata2

r = ses.search_image('test1.jpg', pre_filter={"term": {"metadata.tenant_id": "bar-3"}})
assert len(r) == 0


def test_all_orientations(ses):
im = Image.open('test1.jpg')
Expand All @@ -206,12 +202,12 @@ def test_all_orientations(ses):
r = ses.search_image('rotated_test1.jpg', all_orientations=True)
assert len(r) == 1
assert r[0]['path'] == 'test1.jpg'
assert r[0]['dist'] < 0.05 # some error from rotation
assert r[0]['score'] > 55 # some error from rotation

with open('rotated_test1.jpg', 'rb') as f:
r = ses.search_image(f.read(), bytestream=True, all_orientations=True)
assert len(r) == 1
assert r[0]['dist'] < 0.05 # some error from rotation
assert r[0]['score'] > 55 # some error from rotation


def test_duplicate(ses):
Expand All @@ -220,8 +216,8 @@ def test_duplicate(ses):
r = ses.search_image('test1.jpg')
assert len(r) == 2
assert r[0]['path'] == 'test1.jpg'
assert r[1]['path'] == 'test1.jpg'
assert 'score' in r[0]
assert 'dist' in r[0]
assert 'id' in r[0]


Expand Down
24 changes: 12 additions & 12 deletions tests/test_elasticsearch_driver_metadata_as_nested.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,16 @@
DOC_TYPE = 'image'
MAPPINGS = {
"mappings": {
DOC_TYPE: {
DOC_TYPE: {
"dynamic": True,
"properties": {
"metadata": {
"properties": {
"metadata": {
"type": "nested",
"dynamic": True,
"properties": {
"properties": {
"tenant_id": { "type": "keyword" },
"project_id": { "type": "keyword" }
}
}
}
}
}
Expand Down Expand Up @@ -101,23 +101,23 @@ def test_lookup_with_filter_by_metadata(ses):
assert len(r) == 2

r = ses.search_image('test1.jpg', pre_filter=_nested_filter('foo', 'project-z'))
assert len(r) == 0
assert len(r) == 0

r = ses.search_image('test1.jpg', pre_filter=_nested_filter('bar', 'project-x'))
assert len(r) == 1

r = ses.search_image('test1.jpg', pre_filter=_nested_filter('bar-2', 'project-x'))
assert len(r) == 0

r = ses.search_image('test1.jpg', pre_filter=_nested_filter('bar', 'project-z'))
assert len(r) == 0
assert len(r) == 0

def _metadata(tenant_id, project_id):
return dict(
tenant_id=tenant_id,
project_id=project_id
)

def _nested_filter(tenant_id, project_id):
return {
"nested" : {
Expand All @@ -129,6 +129,6 @@ def _nested_filter(tenant_id, project_id):
{"term": {"metadata.project_id": project_id}}
]
}
}
}
}
}
}