diff options
| -rw-r--r-- | cmds/incidentd/Android.bp | 1 | ||||
| -rw-r--r-- | cmds/incidentd/src/FdBuffer.cpp | 21 | ||||
| -rw-r--r-- | cmds/incidentd/src/IncidentService.cpp | 8 | ||||
| -rw-r--r-- | cmds/incidentd/src/PrivacyBuffer.cpp | 15 | ||||
| -rw-r--r-- | cmds/incidentd/src/Reporter.cpp | 12 | ||||
| -rw-r--r-- | cmds/incidentd/src/Section.cpp | 86 | ||||
| -rw-r--r-- | libs/incident/proto/android/os/metadata.proto | 4 |
7 files changed, 95 insertions, 52 deletions
diff --git a/cmds/incidentd/Android.bp b/cmds/incidentd/Android.bp index 1e970f46b01d..40da583e4f6c 100644 --- a/cmds/incidentd/Android.bp +++ b/cmds/incidentd/Android.bp @@ -53,6 +53,7 @@ cc_binary { "libprotoutil", "libservices", "libutils", + "libprotobuf-cpp-lite", ], init_rc: ["incidentd.rc"], diff --git a/cmds/incidentd/src/FdBuffer.cpp b/cmds/incidentd/src/FdBuffer.cpp index a8ef8311720d..74cda1da7216 100644 --- a/cmds/incidentd/src/FdBuffer.cpp +++ b/cmds/incidentd/src/FdBuffer.cpp @@ -47,9 +47,13 @@ status_t FdBuffer::read(int fd, int64_t timeout) { while (true) { if (mBuffer.size() >= MAX_BUFFER_COUNT * BUFFER_SIZE) { mTruncated = true; + VLOG("Truncating data"); break; } - if (mBuffer.writeBuffer() == NULL) return NO_MEMORY; + if (mBuffer.writeBuffer() == NULL) { + VLOG("No memory"); + return NO_MEMORY; + } int64_t remainingTime = (mStartTime + timeout) - uptimeMillis(); if (remainingTime <= 0) { @@ -58,7 +62,7 @@ status_t FdBuffer::read(int fd, int64_t timeout) { break; } - int count = poll(&pfds, 1, remainingTime); + int count = TEMP_FAILURE_RETRY(poll(&pfds, 1, remainingTime)); if (count == 0) { VLOG("timed out due to block calling poll"); mTimedOut = true; @@ -102,7 +106,10 @@ status_t FdBuffer::readFully(int fd) { VLOG("Truncating data"); break; } - if (mBuffer.writeBuffer() == NULL) return NO_MEMORY; + if (mBuffer.writeBuffer() == NULL) { + VLOG("No memory"); + return NO_MEMORY; + } ssize_t amt = TEMP_FAILURE_RETRY(::read(fd, mBuffer.writeBuffer(), mBuffer.currentToWrite())); @@ -144,10 +151,14 @@ status_t FdBuffer::readProcessedDataInStream(int fd, unique_fd toFd, unique_fd f // This is the buffer used to store processed data while (true) { if (mBuffer.size() >= MAX_BUFFER_COUNT * BUFFER_SIZE) { + VLOG("Truncating data"); mTruncated = true; break; } - if (mBuffer.writeBuffer() == NULL) return NO_MEMORY; + if (mBuffer.writeBuffer() == NULL) { + VLOG("No memory"); + return NO_MEMORY; + } int64_t remainingTime = (mStartTime + timeoutMs) - uptimeMillis(); if (remainingTime <= 0) { @@ -157,7 +168,7 @@ status_t FdBuffer::readProcessedDataInStream(int fd, unique_fd toFd, unique_fd f } // wait for any pfds to be ready to perform IO - int count = poll(pfds, 3, remainingTime); + int count = TEMP_FAILURE_RETRY(poll(pfds, 3, remainingTime)); if (count == 0) { VLOG("timed out due to block calling poll"); mTimedOut = true; diff --git a/cmds/incidentd/src/IncidentService.cpp b/cmds/incidentd/src/IncidentService.cpp index b23c1eda87b3..3d892adcb195 100644 --- a/cmds/incidentd/src/IncidentService.cpp +++ b/cmds/incidentd/src/IncidentService.cpp @@ -42,13 +42,11 @@ enum { WHAT_RUN_REPORT = 1, WHAT_SEND_BACKLOG_TO_DROPBOX = 2 }; #define DEFAULT_BYTES_SIZE_LIMIT (20 * 1024 * 1024) // 20MB #define DEFAULT_REFACTORY_PERIOD_MS (24 * 60 * 60 * 1000) // 1 Day -// Skip logs (1100 - 1108) because they are already in the bug report -// Skip 1200, 1201, 1202, 3018 because they take too long -// TODO(120079956): Skip 3008, 3015 because of error +// Skip these sections for dumpstate only. Dumpstate allows 10s max for each service to dump. +// Skip logs (1100 - 1108) and traces (1200 - 1202) because they are already in the bug report. +// Skip 3018 because it takes too long. #define SKIPPED_SECTIONS { 1100, 1101, 1102, 1103, 1104, 1105, 1106, 1107, 1108, /* Logs */ \ 1200, 1201, 1202, /* Native, hal, java traces */ \ - 3008, /* "package --proto" */ \ - 3015, /* "activity --proto processes" */ \ 3018 /* "meminfo -a --proto" */ } namespace android { diff --git a/cmds/incidentd/src/PrivacyBuffer.cpp b/cmds/incidentd/src/PrivacyBuffer.cpp index 7a8ebe394d51..6967e3391fb8 100644 --- a/cmds/incidentd/src/PrivacyBuffer.cpp +++ b/cmds/incidentd/src/PrivacyBuffer.cpp @@ -96,7 +96,12 @@ status_t PrivacyBuffer::stripField(const Privacy* parentPolicy, const PrivacySpe uint64_t token = mProto.start(encode_field_id(policy)); while (mData.rp()->pos() - start != msgSize) { status_t err = stripField(policy, spec, depth + 1); - if (err != NO_ERROR) return err; + if (err != NO_ERROR) { + VLOG("Bad value when stripping id %d, wiretype %d, tag %#x, depth %d, size %d, " + "relative pos %zu, ", fieldId, read_wire_type(fieldTag), fieldTag, depth, + msgSize, mData.rp()->pos() - start); + return err; + } } mProto.end(token); return NO_ERROR; @@ -117,9 +122,13 @@ status_t PrivacyBuffer::strip(const PrivacySpec& spec) { } while (mData.hasNext()) { status_t err = stripField(mPolicy, spec, 0); - if (err != NO_ERROR) return err; + if (err != NO_ERROR) return err; // Error logged in stripField. + } + if (mData.bytesRead() != mData.size()) { + VLOG("Buffer corrupted: expect %zu bytes, read %zu bytes", + mData.size(), mData.bytesRead()); + return BAD_VALUE; } - if (mData.bytesRead() != mData.size()) return BAD_VALUE; mSize = mProto.size(); mData.rp()->rewind(); // rewind the read pointer back to beginning after the strip. return NO_ERROR; diff --git a/cmds/incidentd/src/Reporter.cpp b/cmds/incidentd/src/Reporter.cpp index b3bab0cfb4dc..8f62da202606 100644 --- a/cmds/incidentd/src/Reporter.cpp +++ b/cmds/incidentd/src/Reporter.cpp @@ -129,6 +129,7 @@ Reporter::run_report_status_t Reporter::runReport(size_t* reportByteSize) { bool needMainFd = false; int mainFd = -1; int mainDest = -1; + int sectionCount = 0; HeaderSection headers; MetadataSection metadataSection; std::string buildType = android::base::GetProperty("ro.build.type", ""); @@ -180,12 +181,12 @@ Reporter::run_report_status_t Reporter::runReport(size_t* reportByteSize) { for (const Section** section = SECTION_LIST; *section; section++) { const int id = (*section)->id; if ((*section)->userdebugAndEngOnly && !isUserdebugOrEng) { - ALOGD("Skipping incident report section %d '%s' because it's limited to userdebug/eng", + VLOG("Skipping incident report section %d '%s' because it's limited to userdebug/eng", id, (*section)->name.string()); continue; } if (this->batch.containsSection(id)) { - ALOGD("Taking incident report section %d '%s'", id, (*section)->name.string()); + VLOG("Taking incident report section %d '%s'", id, (*section)->name.string()); for (ReportRequestSet::iterator it = batch.begin(); it != batch.end(); it++) { if ((*it)->listener != NULL && (*it)->args.containsSection(id)) { (*it)->listener->onReportSectionStatus( @@ -198,11 +199,12 @@ Reporter::run_report_status_t Reporter::runReport(size_t* reportByteSize) { int64_t startTime = uptimeMillis(); err = (*section)->Execute(&batch); int64_t endTime = uptimeMillis(); - stats->set_success(err == NO_ERROR); stats->set_exec_duration_ms(endTime - startTime); if (err != NO_ERROR) { ALOGW("Incident section %s (%d) failed: %s. Stopping report.", (*section)->name.string(), id, strerror(-err)); + // Execute() has already recorded this status. Only update if there's new failure. + stats->set_success(false); goto DONE; } (*reportByteSize) += stats->report_size_bytes(); @@ -214,11 +216,13 @@ Reporter::run_report_status_t Reporter::runReport(size_t* reportByteSize) { id, IIncidentReportStatusListener::STATUS_FINISHED); } } - ALOGD("Finish incident report section %d '%s'", id, (*section)->name.string()); + VLOG("Finish incident report section %d '%s'", id, (*section)->name.string()); + sectionCount++; } } DONE: + ALOGD("Incident reporting took %d sections.", sectionCount); // Reports the metdadata when taking the incident report. if (!isTest) metadataSection.Execute(&batch); diff --git a/cmds/incidentd/src/Section.cpp b/cmds/incidentd/src/Section.cpp index cd48af95d08a..10d226822331 100644 --- a/cmds/incidentd/src/Section.cpp +++ b/cmds/incidentd/src/Section.cpp @@ -75,6 +75,7 @@ static void write_section_stats(IncidentMetadata::SectionStats* stats, const FdB stats->set_dump_duration_ms(buffer.durationMs()); stats->set_timed_out(buffer.timedOut()); stats->set_is_truncated(buffer.truncated()); + stats->set_success(!buffer.timedOut() && !buffer.truncated()); } // Reads data from FdBuffer and writes it to the requests file descriptor. @@ -83,7 +84,8 @@ static status_t write_report_requests(const int id, const FdBuffer& buffer, status_t err = -EBADF; EncodedBuffer::iterator data = buffer.data(); PrivacyBuffer privacyBuffer(get_privacy_of_section(id), data); - int writeable = 0; + IncidentMetadata::SectionStats* stats = requests->sectionStats(id); + int nPassed = 0; // The streaming ones, group requests by spec in order to save unnecessary strip operations map<PrivacySpec, vector<sp<ReportRequest>>> requestsBySpec; @@ -99,7 +101,18 @@ static status_t write_report_requests(const int id, const FdBuffer& buffer, for (auto mit = requestsBySpec.begin(); mit != requestsBySpec.end(); mit++) { PrivacySpec spec = mit->first; err = privacyBuffer.strip(spec); - if (err != NO_ERROR) return err; // it means the privacyBuffer data is corrupted. + if (err != NO_ERROR) { + // Privacy Buffer is corrupted, probably due to an ill-formatted proto. This is a + // non-fatal error. The whole report is still valid. So just log the failure. + ALOGW("Failed to strip section %d with spec %d: %s", + id, spec.dest, strerror(-err)); + stats->set_success(false); + stats->set_error_msg("Failed to strip section: privacy buffer corrupted, probably " + "due to an ill-formatted proto"); + nPassed++; + continue; + } + if (privacyBuffer.size() == 0) continue; for (auto it = mit->second.begin(); it != mit->second.end(); it++) { @@ -114,7 +127,7 @@ static status_t write_report_requests(const int id, const FdBuffer& buffer, request->err = err; continue; } - writeable++; + nPassed++; VLOG("Section %d flushed %zu bytes to fd %d with spec %d", id, privacyBuffer.size(), request->fd, spec.dest); } @@ -125,7 +138,15 @@ static status_t write_report_requests(const int id, const FdBuffer& buffer, if (requests->mainFd() >= 0) { PrivacySpec spec = PrivacySpec::new_spec(requests->mainDest()); err = privacyBuffer.strip(spec); - if (err != NO_ERROR) return err; // the buffer data is corrupted. + if (err != NO_ERROR) { + ALOGW("Failed to strip section %d with spec %d for dropbox: %s", + id, spec.dest, strerror(-err)); + stats->set_success(false); + stats->set_error_msg("Failed to strip section: privacy buffer corrupted, probably " + "due to an ill-formatted proto"); + nPassed++; + goto DONE; + } if (privacyBuffer.size() == 0) goto DONE; err = write_section_header(requests->mainFd(), id, privacyBuffer.size()); @@ -138,7 +159,7 @@ static status_t write_report_requests(const int id, const FdBuffer& buffer, requests->setMainFd(-1); goto DONE; } - writeable++; + nPassed++; VLOG("Section %d flushed %zu bytes to dropbox %d with spec %d", id, privacyBuffer.size(), requests->mainFd(), spec.dest); // Reports bytes of the section uploaded via dropbox after filtering. @@ -147,7 +168,7 @@ static status_t write_report_requests(const int id, const FdBuffer& buffer, DONE: // only returns error if there is no fd to write to. - return writeable > 0 ? NO_ERROR : err; + return nPassed > 0 ? NO_ERROR : err; } // ================================================================================ @@ -213,6 +234,8 @@ status_t MetadataSection::Execute(ReportRequestSet* requests) const { stats.timed_out()); proto.write(FIELD_TYPE_BOOL | IncidentMetadata::SectionStats::kIsTruncatedFieldNumber, stats.is_truncated()); + proto.write(FIELD_TYPE_STRING | IncidentMetadata::SectionStats::kErrorMsgFieldNumber, + stats.error_msg()); proto.end(token); } @@ -439,8 +462,9 @@ status_t WorkerThreadSection::Execute(ReportRequestSet* requests) const { status_t err = NO_ERROR; pthread_t thread; pthread_attr_t attr; - bool timedOut = false; + bool workerDone = false; FdBuffer buffer; + IncidentMetadata::SectionStats* stats = requests->sectionStats(this->id); // Data shared between this thread and the worker thread. sp<WorkerThreadData> data = new WorkerThreadData(this); @@ -475,8 +499,7 @@ status_t WorkerThreadSection::Execute(ReportRequestSet* requests) const { // Loop reading until either the timeout or the worker side is done (i.e. eof). err = buffer.read(data->pipe.readFd().get(), this->timeoutMs); if (err != NO_ERROR) { - // TODO: Log this error into the incident report. - ALOGW("[%s] reader failed with error '%s'", this->name.string(), strerror(-err)); + ALOGE("[%s] reader failed with error '%s'", this->name.string(), strerror(-err)); } // Done with the read fd. The worker thread closes the write one so @@ -484,31 +507,32 @@ status_t WorkerThreadSection::Execute(ReportRequestSet* requests) const { data->pipe.readFd().reset(); // If the worker side is finished, then return its error (which may overwrite - // our possible error -- but it's more interesting anyway). If not, then we timed out. + // our possible error -- but it's more interesting anyway). If not, then we timed out. { unique_lock<mutex> lock(data->lock); - if (!data->workerDone) { - // We timed out - timedOut = true; - } else { - if (data->workerError != NO_ERROR) { - err = data->workerError; - // TODO: Log this error into the incident report. - ALOGW("[%s] worker failed with error '%s'", this->name.string(), strerror(-err)); - } + if (data->workerError != NO_ERROR) { + err = data->workerError; + ALOGE("[%s] worker failed with error '%s'", this->name.string(), strerror(-err)); } + workerDone = data->workerDone; } - write_section_stats(requests->sectionStats(this->id), buffer); - if (timedOut || buffer.timedOut()) { - ALOGW("[%s] timed out", this->name.string()); + write_section_stats(stats, buffer); + if (err != NO_ERROR) { + char errMsg[128]; + snprintf(errMsg, 128, "[%s] failed with error '%s'", + this->name.string(), strerror(-err)); + stats->set_success(false); + stats->set_error_msg(errMsg); return NO_ERROR; } - - // TODO: There was an error with the command or buffering. Report that. For now - // just exit with a log messasge. - if (err != NO_ERROR) { - ALOGW("[%s] failed with error '%s'", this->name.string(), strerror(-err)); + if (buffer.truncated()) { + ALOGW("[%s] too large, truncating", this->name.string()); + // Do not write a truncated section. It won't pass through the PrivacyBuffer. + return NO_ERROR; + } + if (!workerDone || buffer.timedOut()) { + ALOGW("[%s] timed out", this->name.string()); return NO_ERROR; } @@ -617,14 +641,8 @@ status_t DumpsysSection::BlockingCall(int pipeWriteFd) const { sp<IBinder> service = defaultServiceManager()->checkService(mService); if (service == NULL) { - // Returning an error interrupts the entire incident report, so just - // log the failure. - // TODO: have a meta record inside the report that would log this - // failure inside the report, because the fact that we can't find - // the service is good data in and of itself. This is running in - // another thread so lock that carefully... ALOGW("DumpsysSection: Can't lookup service: %s", String8(mService).string()); - return NO_ERROR; + return NAME_NOT_FOUND; } service->dump(pipeWriteFd, mArgs); diff --git a/libs/incident/proto/android/os/metadata.proto b/libs/incident/proto/android/os/metadata.proto index f8f4e36b1e89..3b0e9c9aa17a 100644 --- a/libs/incident/proto/android/os/metadata.proto +++ b/libs/incident/proto/android/os/metadata.proto @@ -61,8 +61,10 @@ message IncidentMetadata { optional bool timed_out = 7; // true if the section is truncated. optional bool is_truncated = 8; + // message for debugging if there is an error. + optional string error_msg = 9; - // Next Tag: 9 + // Next Tag: 10; } repeated SectionStats sections = 6; |